1 /* 2 3 Unicode implementation based on original code by Fredrik Lundh, 4 modified by Marc-Andre Lemburg <mal (at) lemburg.com>. 5 6 Major speed upgrades to the method implementations at the Reykjavik 7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9 Copyright (c) Corporation for National Research Initiatives. 10 11 -------------------------------------------------------------------- 12 The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17 By obtaining, using, and/or copying this software and/or its 18 associated documentation, you agree that you have read, understood, 19 and will comply with the following terms and conditions: 20 21 Permission to use, copy, modify, and distribute this software and its 22 associated documentation for any purpose and without fee is hereby 23 granted, provided that the above copyright notice appears in all 24 copies, and that both that copyright notice and this permission notice 25 appear in supporting documentation, and that the name of Secret Labs 26 AB or the author not be used in advertising or publicity pertaining to 27 distribution of the software without specific, written prior 28 permission. 29 30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37 -------------------------------------------------------------------- 38 39 */ 40 41 #define PY_SSIZE_T_CLEAN 42 #include "Python.h" 43 #include "ucnhash.h" 44 #include "bytes_methods.h" 45 #include "stringlib/eq.h" 46 47 #ifdef MS_WINDOWS 48 #include <windows.h> 49 #endif 50 51 /*[clinic input] 52 class str "PyUnicodeObject *" "&PyUnicode_Type" 53 [clinic start generated code]*/ 54 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/ 55 56 /* --- Globals ------------------------------------------------------------ 57 58 NOTE: In the interpreter's initialization phase, some globals are currently 59 initialized dynamically as needed. In the process Unicode objects may 60 be created before the Unicode type is ready. 61 62 */ 63 64 65 #ifdef __cplusplus 66 extern "C" { 67 #endif 68 69 /* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 70 #define MAX_UNICODE 0x10ffff 71 72 #ifdef Py_DEBUG 73 # define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 74 #else 75 # define _PyUnicode_CHECK(op) PyUnicode_Check(op) 76 #endif 77 78 #define _PyUnicode_UTF8(op) \ 79 (((PyCompactUnicodeObject*)(op))->utf8) 80 #define PyUnicode_UTF8(op) \ 81 (assert(_PyUnicode_CHECK(op)), \ 82 assert(PyUnicode_IS_READY(op)), \ 83 PyUnicode_IS_COMPACT_ASCII(op) ? \ 84 ((char*)((PyASCIIObject*)(op) + 1)) : \ 85 _PyUnicode_UTF8(op)) 86 #define _PyUnicode_UTF8_LENGTH(op) \ 87 (((PyCompactUnicodeObject*)(op))->utf8_length) 88 #define PyUnicode_UTF8_LENGTH(op) \ 89 (assert(_PyUnicode_CHECK(op)), \ 90 assert(PyUnicode_IS_READY(op)), \ 91 PyUnicode_IS_COMPACT_ASCII(op) ? 
/* True if the wstr pointer stored in the object is the same address as the
   object's character data (PyUnicode_DATA), i.e. both refer to one buffer.

   The object is only ever accessed through the macro parameter "op".  The
   previous definition read _PyUnicode_WSTR(unicode), which silently captured
   whatever variable named "unicode" was in scope at the call site and failed
   to compile (or tested the wrong object) everywhere else. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names; begin and end are
   pointers to the source characters (of type "from_type *") and delimit the
   half-open range [begin, end).  to is a pointer of type "to_type *" to the
   buffer that receives one converted character per source character.
   Each of begin, end and to is evaluated exactly once. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)        \
    do {                                                                    \
        to_type *_cvt_out = (to_type *)(to);                                \
        const from_type *_cvt_in = (from_type *)(begin);                    \
        const from_type *_cvt_stop = (from_type *)(end);                    \
        Py_ssize_t _cvt_count = _cvt_stop - _cvt_in;                        \
        /* end of the part whose length is a multiple of four */            \
        const from_type *_cvt_block_stop =                                  \
            _cvt_in + _Py_SIZE_ROUND_DOWN(_cvt_count, 4);                   \
        /* unrolled part: four characters per iteration */                  \
        for (; _cvt_in < _cvt_block_stop; _cvt_in += 4, _cvt_out += 4) {    \
            _cvt_out[0] = (to_type) _cvt_in[0];                             \
            _cvt_out[1] = (to_type) _cvt_in[1];                             \
            _cvt_out[2] = (to_type) _cvt_in[2];                             \
            _cvt_out[3] = (to_type) _cvt_in[3];                             \
        }                                                                   \
        /* remaining zero to three characters */                            \
        for (; _cvt_in < _cvt_stop; _cvt_in++, _cvt_out++)                  \
            *_cvt_out = (to_type) *_cvt_in;                                 \
    } while (0)
/* Fast detection of the most frequent whitespace characters.

   Lookup table with one entry per code point in the range 0..127.  An entry
   is 1 for the code points listed below and 0 for every other index (the
   entries that are not named are zero-initialized). */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
/* Codes for the codec error handlers that have a dedicated fast path. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Translate the name of an error handler into its _Py_error_handler code.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT when errors is NULL or "strict", the matching
   code for the other names in the table below, and _Py_ERROR_OTHER for any
   name that is not in the table.  Names are compared exactly (strcmp). */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler code;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    /* no handler given means the default one */
    if (errors == NULL)
        return _Py_ERROR_STRICT;

    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0)
            return known_handlers[idx].code;
    }
    return _Py_ERROR_OTHER;
}
assert(ascii->hash == -1); 393 assert(ascii->state.compact == 0); 394 assert(ascii->state.ascii == 0); 395 assert(ascii->state.ready == 0); 396 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 397 assert(ascii->wstr != NULL); 398 assert(data == NULL); 399 assert(compact->utf8 == NULL); 400 } 401 else { 402 assert(kind == PyUnicode_1BYTE_KIND 403 || kind == PyUnicode_2BYTE_KIND 404 || kind == PyUnicode_4BYTE_KIND); 405 assert(ascii->state.compact == 0); 406 assert(ascii->state.ready == 1); 407 assert(data != NULL); 408 if (ascii->state.ascii) { 409 assert (compact->utf8 == data); 410 assert (compact->utf8_length == ascii->length); 411 } 412 else 413 assert (compact->utf8 != data); 414 } 415 } 416 if (kind != PyUnicode_WCHAR_KIND) { 417 if ( 418 #if SIZEOF_WCHAR_T == 2 419 kind == PyUnicode_2BYTE_KIND 420 #else 421 kind == PyUnicode_4BYTE_KIND 422 #endif 423 ) 424 { 425 assert(ascii->wstr == data); 426 assert(compact->wstr_length == ascii->length); 427 } else 428 assert(ascii->wstr != data); 429 } 430 431 if (compact->utf8 == NULL) 432 assert(compact->utf8_length == 0); 433 if (ascii->wstr == NULL) 434 assert(compact->wstr_length == 0); 435 } 436 /* check that the best kind is used */ 437 if (check_content && kind != PyUnicode_WCHAR_KIND) 438 { 439 Py_ssize_t i; 440 Py_UCS4 maxchar = 0; 441 void *data; 442 Py_UCS4 ch; 443 444 data = PyUnicode_DATA(ascii); 445 for (i=0; i < ascii->length; i++) 446 { 447 ch = PyUnicode_READ(kind, data, i); 448 if (ch > maxchar) 449 maxchar = ch; 450 } 451 if (kind == PyUnicode_1BYTE_KIND) { 452 if (ascii->state.ascii == 0) { 453 assert(maxchar >= 128); 454 assert(maxchar <= 255); 455 } 456 else 457 assert(maxchar < 128); 458 } 459 else if (kind == PyUnicode_2BYTE_KIND) { 460 assert(maxchar >= 0x100); 461 assert(maxchar <= 0xFFFF); 462 } 463 else { 464 assert(maxchar >= 0x10000); 465 assert(maxchar <= MAX_UNICODE); 466 } 467 assert(PyUnicode_READ(kind, data, ascii->length) == 0); 468 } 469 return 1; 470 } 471 #endif 472 473 static 
/* Normalize a ready string that is about to be returned as a result.

   - A string of length 0 is replaced by the shared unicode_empty object;
     the reference to the argument is dropped unless the argument already
     is unicode_empty.
   - A string of length 1 whose character is below 256 is replaced by the
     entry cached in unicode_latin1[]; if no entry is cached yet, the
     argument itself is stored in the cache (with an extra reference) and
     returned.
   - Any other string is returned as is. */
static PyObject*
unicode_result_ready(PyObject *unicode)
{
    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);

    if (nchars == 0) {
        if (unicode == unicode_empty)
            return unicode_empty;
        Py_DECREF(unicode);
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (nchars == 1) {
        void *chars = PyUnicode_DATA(unicode);
        int width = PyUnicode_KIND(unicode);
        Py_UCS4 only_char = PyUnicode_READ(width, chars, 0);

        if (only_char < 256) {
            PyObject *cached = unicode_latin1[only_char];

            if (cached == NULL) {
                /* first time this character is seen: the argument becomes
                   the cached singleton */
                assert(_PyUnicode_CheckConsistency(unicode, 1));
                Py_INCREF(unicode);
                unicode_latin1[only_char] = unicode;
                return unicode;
            }
            if (cached != unicode) {
                /* hand out the singleton instead of the argument */
                Py_INCREF(cached);
                Py_DECREF(unicode);
            }
            return cached;
        }
    }

    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
}
unicode_result_unchanged(PyObject *unicode) 560 { 561 if (PyUnicode_CheckExact(unicode)) { 562 if (PyUnicode_READY(unicode) == -1) 563 return NULL; 564 Py_INCREF(unicode); 565 return unicode; 566 } 567 else 568 /* Subtype -- return genuine unicode string with the same value. */ 569 return _PyUnicode_Copy(unicode); 570 } 571 572 /* Implementation of the "backslashreplace" error handler for 8-bit encodings: 573 ASCII, Latin1, UTF-8, etc. */ 574 static char* 575 backslashreplace(_PyBytesWriter *writer, char *str, 576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) 577 { 578 Py_ssize_t size, i; 579 Py_UCS4 ch; 580 enum PyUnicode_Kind kind; 581 void *data; 582 583 assert(PyUnicode_IS_READY(unicode)); 584 kind = PyUnicode_KIND(unicode); 585 data = PyUnicode_DATA(unicode); 586 587 size = 0; 588 /* determine replacement size */ 589 for (i = collstart; i < collend; ++i) { 590 Py_ssize_t incr; 591 592 ch = PyUnicode_READ(kind, data, i); 593 if (ch < 0x100) 594 incr = 2+2; 595 else if (ch < 0x10000) 596 incr = 2+4; 597 else { 598 assert(ch <= MAX_UNICODE); 599 incr = 2+8; 600 } 601 if (size > PY_SSIZE_T_MAX - incr) { 602 PyErr_SetString(PyExc_OverflowError, 603 "encoded result is too long for a Python string"); 604 return NULL; 605 } 606 size += incr; 607 } 608 609 str = _PyBytesWriter_Prepare(writer, str, size); 610 if (str == NULL) 611 return NULL; 612 613 /* generate replacement */ 614 for (i = collstart; i < collend; ++i) { 615 ch = PyUnicode_READ(kind, data, i); 616 *str++ = '\\'; 617 if (ch >= 0x00010000) { 618 *str++ = 'U'; 619 *str++ = Py_hexdigits[(ch>>28)&0xf]; 620 *str++ = Py_hexdigits[(ch>>24)&0xf]; 621 *str++ = Py_hexdigits[(ch>>20)&0xf]; 622 *str++ = Py_hexdigits[(ch>>16)&0xf]; 623 *str++ = Py_hexdigits[(ch>>12)&0xf]; 624 *str++ = Py_hexdigits[(ch>>8)&0xf]; 625 } 626 else if (ch >= 0x100) { 627 *str++ = 'u'; 628 *str++ = Py_hexdigits[(ch>>12)&0xf]; 629 *str++ = Py_hexdigits[(ch>>8)&0xf]; 630 } 631 else 632 *str++ = 'x'; 633 *str++ = 
Py_hexdigits[(ch>>4)&0xf]; 634 *str++ = Py_hexdigits[ch&0xf]; 635 } 636 return str; 637 } 638 639 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings: 640 ASCII, Latin1, UTF-8, etc. */ 641 static char* 642 xmlcharrefreplace(_PyBytesWriter *writer, char *str, 643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) 644 { 645 Py_ssize_t size, i; 646 Py_UCS4 ch; 647 enum PyUnicode_Kind kind; 648 void *data; 649 650 assert(PyUnicode_IS_READY(unicode)); 651 kind = PyUnicode_KIND(unicode); 652 data = PyUnicode_DATA(unicode); 653 654 size = 0; 655 /* determine replacement size */ 656 for (i = collstart; i < collend; ++i) { 657 Py_ssize_t incr; 658 659 ch = PyUnicode_READ(kind, data, i); 660 if (ch < 10) 661 incr = 2+1+1; 662 else if (ch < 100) 663 incr = 2+2+1; 664 else if (ch < 1000) 665 incr = 2+3+1; 666 else if (ch < 10000) 667 incr = 2+4+1; 668 else if (ch < 100000) 669 incr = 2+5+1; 670 else if (ch < 1000000) 671 incr = 2+6+1; 672 else { 673 assert(ch <= MAX_UNICODE); 674 incr = 2+7+1; 675 } 676 if (size > PY_SSIZE_T_MAX - incr) { 677 PyErr_SetString(PyExc_OverflowError, 678 "encoded result is too long for a Python string"); 679 return NULL; 680 } 681 size += incr; 682 } 683 684 str = _PyBytesWriter_Prepare(writer, str, size); 685 if (str == NULL) 686 return NULL; 687 688 /* generate replacement */ 689 for (i = collstart; i < collend; ++i) { 690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 691 } 692 return str; 693 } 694 695 /* --- Bloom Filters ----------------------------------------------------- */ 696 697 /* stuff to implement simple "bloom filters" for Unicode characters. 698 to keep things simple, we use a single bitmask, using the least 5 699 bits from each unicode characters as the bit index. 
*/ 700 701 /* the linebreak mask is set up by Unicode_Init below */ 702 703 #if LONG_BIT >= 128 704 #define BLOOM_WIDTH 128 705 #elif LONG_BIT >= 64 706 #define BLOOM_WIDTH 64 707 #elif LONG_BIT >= 32 708 #define BLOOM_WIDTH 32 709 #else 710 #error "LONG_BIT is smaller than 32" 711 #endif 712 713 #define BLOOM_MASK unsigned long 714 715 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; 716 717 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 718 719 #define BLOOM_LINEBREAK(ch) \ 720 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 721 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 722 723 static inline BLOOM_MASK 724 make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 725 { 726 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \ 727 do { \ 728 TYPE *data = (TYPE *)PTR; \ 729 TYPE *end = data + LEN; \ 730 Py_UCS4 ch; \ 731 for (; data != end; data++) { \ 732 ch = *data; \ 733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \ 734 } \ 735 break; \ 736 } while (0) 737 738 /* calculate simple bloom-style bitmask for a given unicode string */ 739 740 BLOOM_MASK mask; 741 742 mask = 0; 743 switch (kind) { 744 case PyUnicode_1BYTE_KIND: 745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len); 746 break; 747 case PyUnicode_2BYTE_KIND: 748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len); 749 break; 750 case PyUnicode_4BYTE_KIND: 751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len); 752 break; 753 default: 754 assert(0); 755 } 756 return mask; 757 758 #undef BLOOM_UPDATE 759 } 760 761 static int 762 ensure_unicode(PyObject *obj) 763 { 764 if (!PyUnicode_Check(obj)) { 765 PyErr_Format(PyExc_TypeError, 766 "must be str, not %.100s", 767 Py_TYPE(obj)->tp_name); 768 return -1; 769 } 770 return PyUnicode_READY(obj); 771 } 772 773 /* Compilation of templated routines */ 774 775 #include "stringlib/asciilib.h" 776 #include "stringlib/fastsearch.h" 777 #include "stringlib/partition.h" 778 #include "stringlib/split.h" 779 #include "stringlib/count.h" 780 #include "stringlib/find.h" 781 #include 
"stringlib/find_max_char.h" 782 #include "stringlib/localeutil.h" 783 #include "stringlib/undef.h" 784 785 #include "stringlib/ucs1lib.h" 786 #include "stringlib/fastsearch.h" 787 #include "stringlib/partition.h" 788 #include "stringlib/split.h" 789 #include "stringlib/count.h" 790 #include "stringlib/find.h" 791 #include "stringlib/replace.h" 792 #include "stringlib/find_max_char.h" 793 #include "stringlib/localeutil.h" 794 #include "stringlib/undef.h" 795 796 #include "stringlib/ucs2lib.h" 797 #include "stringlib/fastsearch.h" 798 #include "stringlib/partition.h" 799 #include "stringlib/split.h" 800 #include "stringlib/count.h" 801 #include "stringlib/find.h" 802 #include "stringlib/replace.h" 803 #include "stringlib/find_max_char.h" 804 #include "stringlib/localeutil.h" 805 #include "stringlib/undef.h" 806 807 #include "stringlib/ucs4lib.h" 808 #include "stringlib/fastsearch.h" 809 #include "stringlib/partition.h" 810 #include "stringlib/split.h" 811 #include "stringlib/count.h" 812 #include "stringlib/find.h" 813 #include "stringlib/replace.h" 814 #include "stringlib/find_max_char.h" 815 #include "stringlib/localeutil.h" 816 #include "stringlib/undef.h" 817 818 #include "stringlib/unicodedefs.h" 819 #include "stringlib/fastsearch.h" 820 #include "stringlib/count.h" 821 #include "stringlib/find.h" 822 #include "stringlib/undef.h" 823 824 /* --- Unicode Object ----------------------------------------------------- */ 825 826 static PyObject * 827 fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 828 829 static inline Py_ssize_t 830 findchar(const void *s, int kind, 831 Py_ssize_t size, Py_UCS4 ch, 832 int direction) 833 { 834 switch (kind) { 835 case PyUnicode_1BYTE_KIND: 836 if ((Py_UCS1) ch != ch) 837 return -1; 838 if (direction > 0) 839 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch); 840 else 841 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch); 842 case PyUnicode_2BYTE_KIND: 843 if ((Py_UCS2) ch != ch) 844 return -1; 845 if 
(direction > 0) 846 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch); 847 else 848 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch); 849 case PyUnicode_4BYTE_KIND: 850 if (direction > 0) 851 return ucs4lib_find_char((Py_UCS4 *) s, size, ch); 852 else 853 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch); 854 default: 855 assert(0); 856 return -1; 857 } 858 } 859 860 #ifdef Py_DEBUG 861 /* Fill the data of a Unicode string with invalid characters to detect bugs 862 earlier. 863 864 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for 865 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an 866 invalid character in Unicode 6.0. */ 867 static void 868 unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length) 869 { 870 int kind = PyUnicode_KIND(unicode); 871 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); 872 Py_ssize_t length = _PyUnicode_LENGTH(unicode); 873 if (length <= old_length) 874 return; 875 memset(data + old_length * kind, 0xff, (length - old_length) * kind); 876 } 877 #endif 878 879 static PyObject* 880 resize_compact(PyObject *unicode, Py_ssize_t length) 881 { 882 Py_ssize_t char_size; 883 Py_ssize_t struct_size; 884 Py_ssize_t new_size; 885 int share_wstr; 886 PyObject *new_unicode; 887 #ifdef Py_DEBUG 888 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 889 #endif 890 891 assert(unicode_modifiable(unicode)); 892 assert(PyUnicode_IS_READY(unicode)); 893 assert(PyUnicode_IS_COMPACT(unicode)); 894 895 char_size = PyUnicode_KIND(unicode); 896 if (PyUnicode_IS_ASCII(unicode)) 897 struct_size = sizeof(PyASCIIObject); 898 else 899 struct_size = sizeof(PyCompactUnicodeObject); 900 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 901 902 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 903 PyErr_NoMemory(); 904 return NULL; 905 } 906 new_size = (struct_size + (length + 1) * char_size); 907 908 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) { 909 
PyObject_DEL(_PyUnicode_UTF8(unicode)); 910 _PyUnicode_UTF8(unicode) = NULL; 911 _PyUnicode_UTF8_LENGTH(unicode) = 0; 912 } 913 _Py_DEC_REFTOTAL; 914 _Py_ForgetReference(unicode); 915 916 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size); 917 if (new_unicode == NULL) { 918 _Py_NewReference(unicode); 919 PyErr_NoMemory(); 920 return NULL; 921 } 922 unicode = new_unicode; 923 _Py_NewReference(unicode); 924 925 _PyUnicode_LENGTH(unicode) = length; 926 if (share_wstr) { 927 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 928 if (!PyUnicode_IS_ASCII(unicode)) 929 _PyUnicode_WSTR_LENGTH(unicode) = length; 930 } 931 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { 932 PyObject_DEL(_PyUnicode_WSTR(unicode)); 933 _PyUnicode_WSTR(unicode) = NULL; 934 if (!PyUnicode_IS_ASCII(unicode)) 935 _PyUnicode_WSTR_LENGTH(unicode) = 0; 936 } 937 #ifdef Py_DEBUG 938 unicode_fill_invalid(unicode, old_length); 939 #endif 940 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 941 length, 0); 942 assert(_PyUnicode_CheckConsistency(unicode, 0)); 943 return unicode; 944 } 945 946 static int 947 resize_inplace(PyObject *unicode, Py_ssize_t length) 948 { 949 wchar_t *wstr; 950 Py_ssize_t new_size; 951 assert(!PyUnicode_IS_COMPACT(unicode)); 952 assert(Py_REFCNT(unicode) == 1); 953 954 if (PyUnicode_IS_READY(unicode)) { 955 Py_ssize_t char_size; 956 int share_wstr, share_utf8; 957 void *data; 958 #ifdef Py_DEBUG 959 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 960 #endif 961 962 data = _PyUnicode_DATA_ANY(unicode); 963 char_size = PyUnicode_KIND(unicode); 964 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 965 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 966 967 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 968 PyErr_NoMemory(); 969 return -1; 970 } 971 new_size = (length + 1) * char_size; 972 973 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 974 { 975 PyObject_DEL(_PyUnicode_UTF8(unicode)); 976 _PyUnicode_UTF8(unicode) = NULL; 977 
_PyUnicode_UTF8_LENGTH(unicode) = 0; 978 } 979 980 data = (PyObject *)PyObject_REALLOC(data, new_size); 981 if (data == NULL) { 982 PyErr_NoMemory(); 983 return -1; 984 } 985 _PyUnicode_DATA_ANY(unicode) = data; 986 if (share_wstr) { 987 _PyUnicode_WSTR(unicode) = data; 988 _PyUnicode_WSTR_LENGTH(unicode) = length; 989 } 990 if (share_utf8) { 991 _PyUnicode_UTF8(unicode) = data; 992 _PyUnicode_UTF8_LENGTH(unicode) = length; 993 } 994 _PyUnicode_LENGTH(unicode) = length; 995 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 996 #ifdef Py_DEBUG 997 unicode_fill_invalid(unicode, old_length); 998 #endif 999 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 1000 assert(_PyUnicode_CheckConsistency(unicode, 0)); 1001 return 0; 1002 } 1003 } 1004 assert(_PyUnicode_WSTR(unicode) != NULL); 1005 1006 /* check for integer overflow */ 1007 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) { 1008 PyErr_NoMemory(); 1009 return -1; 1010 } 1011 new_size = sizeof(wchar_t) * (length + 1); 1012 wstr = _PyUnicode_WSTR(unicode); 1013 wstr = PyObject_REALLOC(wstr, new_size); 1014 if (!wstr) { 1015 PyErr_NoMemory(); 1016 return -1; 1017 } 1018 _PyUnicode_WSTR(unicode) = wstr; 1019 _PyUnicode_WSTR(unicode)[length] = 0; 1020 _PyUnicode_WSTR_LENGTH(unicode) = length; 1021 assert(_PyUnicode_CheckConsistency(unicode, 0)); 1022 return 0; 1023 } 1024 1025 static PyObject* 1026 resize_copy(PyObject *unicode, Py_ssize_t length) 1027 { 1028 Py_ssize_t copy_length; 1029 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { 1030 PyObject *copy; 1031 1032 if (PyUnicode_READY(unicode) == -1) 1033 return NULL; 1034 1035 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 1036 if (copy == NULL) 1037 return NULL; 1038 1039 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 1040 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); 1041 return copy; 1042 } 1043 else { 1044 PyObject *w; 1045 1046 w = (PyObject*)_PyUnicode_New(length); 1047 if (w 
== NULL) 1048 return NULL; 1049 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 1050 copy_length = Py_MIN(copy_length, length); 1051 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 1052 copy_length * sizeof(wchar_t)); 1053 return w; 1054 } 1055 } 1056 1057 /* We allocate one more byte to make sure the string is 1058 Ux0000 terminated; some code (e.g. new_identifier) 1059 relies on that. 1060 1061 XXX This allocator could further be enhanced by assuring that the 1062 free list never reduces its size below 1. 1063 1064 */ 1065 1066 static PyUnicodeObject * 1067 _PyUnicode_New(Py_ssize_t length) 1068 { 1069 PyUnicodeObject *unicode; 1070 size_t new_size; 1071 1072 /* Optimization for empty strings */ 1073 if (length == 0 && unicode_empty != NULL) { 1074 Py_INCREF(unicode_empty); 1075 return (PyUnicodeObject*)unicode_empty; 1076 } 1077 1078 /* Ensure we won't overflow the size. */ 1079 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { 1080 return (PyUnicodeObject *)PyErr_NoMemory(); 1081 } 1082 if (length < 0) { 1083 PyErr_SetString(PyExc_SystemError, 1084 "Negative size passed to _PyUnicode_New"); 1085 return NULL; 1086 } 1087 1088 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 1089 if (unicode == NULL) 1090 return NULL; 1091 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 1092 1093 _PyUnicode_WSTR_LENGTH(unicode) = length; 1094 _PyUnicode_HASH(unicode) = -1; 1095 _PyUnicode_STATE(unicode).interned = 0; 1096 _PyUnicode_STATE(unicode).kind = 0; 1097 _PyUnicode_STATE(unicode).compact = 0; 1098 _PyUnicode_STATE(unicode).ready = 0; 1099 _PyUnicode_STATE(unicode).ascii = 0; 1100 _PyUnicode_DATA_ANY(unicode) = NULL; 1101 _PyUnicode_LENGTH(unicode) = 0; 1102 _PyUnicode_UTF8(unicode) = NULL; 1103 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1104 1105 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 1106 if (!_PyUnicode_WSTR(unicode)) { 1107 Py_DECREF(unicode); 1108 PyErr_NoMemory(); 1109 return NULL; 1110 } 1111 1112 /* 
Initialize the first element to guard against cases where 1113 * the caller fails before initializing str -- unicode_resize() 1114 * reads str[0], and the Keep-Alive optimization can keep memory 1115 * allocated for str alive across a call to unicode_dealloc(unicode). 1116 * We don't want unicode_resize to read uninitialized memory in 1117 * that case. 1118 */ 1119 _PyUnicode_WSTR(unicode)[0] = 0; 1120 _PyUnicode_WSTR(unicode)[length] = 0; 1121 1122 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 1123 return unicode; 1124 } 1125 1126 static const char* 1127 unicode_kind_name(PyObject *unicode) 1128 { 1129 /* don't check consistency: unicode_kind_name() is called from 1130 _PyUnicode_Dump() */ 1131 if (!PyUnicode_IS_COMPACT(unicode)) 1132 { 1133 if (!PyUnicode_IS_READY(unicode)) 1134 return "wstr"; 1135 switch (PyUnicode_KIND(unicode)) 1136 { 1137 case PyUnicode_1BYTE_KIND: 1138 if (PyUnicode_IS_ASCII(unicode)) 1139 return "legacy ascii"; 1140 else 1141 return "legacy latin1"; 1142 case PyUnicode_2BYTE_KIND: 1143 return "legacy UCS2"; 1144 case PyUnicode_4BYTE_KIND: 1145 return "legacy UCS4"; 1146 default: 1147 return "<legacy invalid kind>"; 1148 } 1149 } 1150 assert(PyUnicode_IS_READY(unicode)); 1151 switch (PyUnicode_KIND(unicode)) { 1152 case PyUnicode_1BYTE_KIND: 1153 if (PyUnicode_IS_ASCII(unicode)) 1154 return "ascii"; 1155 else 1156 return "latin1"; 1157 case PyUnicode_2BYTE_KIND: 1158 return "UCS2"; 1159 case PyUnicode_4BYTE_KIND: 1160 return "UCS4"; 1161 default: 1162 return "<invalid compact kind>"; 1163 } 1164 } 1165 1166 #ifdef Py_DEBUG 1167 /* Functions wrapping macros for use in debugger */ 1168 char *_PyUnicode_utf8(void *unicode){ 1169 return PyUnicode_UTF8(unicode); 1170 } 1171 1172 void *_PyUnicode_compact_data(void *unicode) { 1173 return _PyUnicode_COMPACT_DATA(unicode); 1174 } 1175 void *_PyUnicode_data(void *unicode){ 1176 printf("obj %p\n", unicode); 1177 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 1178 printf("compact 
ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 1179 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 1180 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 1181 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 1182 return PyUnicode_DATA(unicode); 1183 } 1184 1185 void 1186 _PyUnicode_Dump(PyObject *op) 1187 { 1188 PyASCIIObject *ascii = (PyASCIIObject *)op; 1189 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 1190 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 1191 void *data; 1192 1193 if (ascii->state.compact) 1194 { 1195 if (ascii->state.ascii) 1196 data = (ascii + 1); 1197 else 1198 data = (compact + 1); 1199 } 1200 else 1201 data = unicode->data.any; 1202 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ", 1203 unicode_kind_name(op), ascii->length); 1204 1205 if (ascii->wstr == data) 1206 printf("shared "); 1207 printf("wstr=%p", ascii->wstr); 1208 1209 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 1210 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length); 1211 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 1212 printf("shared "); 1213 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)", 1214 compact->utf8, compact->utf8_length); 1215 } 1216 printf(", data=%p\n", data); 1217 } 1218 #endif 1219 1220 PyObject * 1221 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 1222 { 1223 PyObject *obj; 1224 PyCompactUnicodeObject *unicode; 1225 void *data; 1226 enum PyUnicode_Kind kind; 1227 int is_sharing, is_ascii; 1228 Py_ssize_t char_size; 1229 Py_ssize_t struct_size; 1230 1231 /* Optimization for empty strings */ 1232 if (size == 0 && unicode_empty != NULL) { 1233 Py_INCREF(unicode_empty); 1234 return unicode_empty; 1235 } 1236 1237 is_ascii = 0; 1238 is_sharing = 0; 1239 struct_size = sizeof(PyCompactUnicodeObject); 1240 if (maxchar < 128) { 1241 kind = PyUnicode_1BYTE_KIND; 1242 char_size = 1; 1243 is_ascii = 1; 1244 struct_size = sizeof(PyASCIIObject); 1245 } 
1246 else if (maxchar < 256) { 1247 kind = PyUnicode_1BYTE_KIND; 1248 char_size = 1; 1249 } 1250 else if (maxchar < 65536) { 1251 kind = PyUnicode_2BYTE_KIND; 1252 char_size = 2; 1253 if (sizeof(wchar_t) == 2) 1254 is_sharing = 1; 1255 } 1256 else { 1257 if (maxchar > MAX_UNICODE) { 1258 PyErr_SetString(PyExc_SystemError, 1259 "invalid maximum character passed to PyUnicode_New"); 1260 return NULL; 1261 } 1262 kind = PyUnicode_4BYTE_KIND; 1263 char_size = 4; 1264 if (sizeof(wchar_t) == 4) 1265 is_sharing = 1; 1266 } 1267 1268 /* Ensure we won't overflow the size. */ 1269 if (size < 0) { 1270 PyErr_SetString(PyExc_SystemError, 1271 "Negative size passed to PyUnicode_New"); 1272 return NULL; 1273 } 1274 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 1275 return PyErr_NoMemory(); 1276 1277 /* Duplicated allocation code from _PyObject_New() instead of a call to 1278 * PyObject_New() so we are able to allocate space for the object and 1279 * it's data buffer. 1280 */ 1281 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 1282 if (obj == NULL) 1283 return PyErr_NoMemory(); 1284 obj = PyObject_INIT(obj, &PyUnicode_Type); 1285 if (obj == NULL) 1286 return NULL; 1287 1288 unicode = (PyCompactUnicodeObject *)obj; 1289 if (is_ascii) 1290 data = ((PyASCIIObject*)obj) + 1; 1291 else 1292 data = unicode + 1; 1293 _PyUnicode_LENGTH(unicode) = size; 1294 _PyUnicode_HASH(unicode) = -1; 1295 _PyUnicode_STATE(unicode).interned = 0; 1296 _PyUnicode_STATE(unicode).kind = kind; 1297 _PyUnicode_STATE(unicode).compact = 1; 1298 _PyUnicode_STATE(unicode).ready = 1; 1299 _PyUnicode_STATE(unicode).ascii = is_ascii; 1300 if (is_ascii) { 1301 ((char*)data)[size] = 0; 1302 _PyUnicode_WSTR(unicode) = NULL; 1303 } 1304 else if (kind == PyUnicode_1BYTE_KIND) { 1305 ((char*)data)[size] = 0; 1306 _PyUnicode_WSTR(unicode) = NULL; 1307 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1308 unicode->utf8 = NULL; 1309 unicode->utf8_length = 0; 1310 } 1311 else { 1312 
unicode->utf8 = NULL; 1313 unicode->utf8_length = 0; 1314 if (kind == PyUnicode_2BYTE_KIND) 1315 ((Py_UCS2*)data)[size] = 0; 1316 else /* kind == PyUnicode_4BYTE_KIND */ 1317 ((Py_UCS4*)data)[size] = 0; 1318 if (is_sharing) { 1319 _PyUnicode_WSTR_LENGTH(unicode) = size; 1320 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 1321 } 1322 else { 1323 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1324 _PyUnicode_WSTR(unicode) = NULL; 1325 } 1326 } 1327 #ifdef Py_DEBUG 1328 unicode_fill_invalid((PyObject*)unicode, 0); 1329 #endif 1330 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 1331 return obj; 1332 } 1333 1334 #if SIZEOF_WCHAR_T == 2 1335 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this 1336 will decode surrogate pairs, the other conversions are implemented as macros 1337 for efficiency. 1338 1339 This function assumes that unicode can hold one more code point than wstr 1340 characters for a terminating null character. */ 1341 static void 1342 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 1343 PyObject *unicode) 1344 { 1345 const wchar_t *iter; 1346 Py_UCS4 *ucs4_out; 1347 1348 assert(unicode != NULL); 1349 assert(_PyUnicode_CHECK(unicode)); 1350 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1351 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 1352 1353 for (iter = begin; iter < end; ) { 1354 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 1355 _PyUnicode_GET_LENGTH(unicode))); 1356 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1357 && (iter+1) < end 1358 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1359 { 1360 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1361 iter += 2; 1362 } 1363 else { 1364 *ucs4_out++ = *iter; 1365 iter++; 1366 } 1367 } 1368 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 1369 _PyUnicode_GET_LENGTH(unicode))); 1370 1371 } 1372 #endif 1373 1374 static int 1375 unicode_check_modifiable(PyObject *unicode) 1376 { 1377 if (!unicode_modifiable(unicode)) { 1378 
PyErr_SetString(PyExc_SystemError, 1379 "Cannot modify a string currently used"); 1380 return -1; 1381 } 1382 return 0; 1383 } 1384 1385 static int 1386 _copy_characters(PyObject *to, Py_ssize_t to_start, 1387 PyObject *from, Py_ssize_t from_start, 1388 Py_ssize_t how_many, int check_maxchar) 1389 { 1390 unsigned int from_kind, to_kind; 1391 void *from_data, *to_data; 1392 1393 assert(0 <= how_many); 1394 assert(0 <= from_start); 1395 assert(0 <= to_start); 1396 assert(PyUnicode_Check(from)); 1397 assert(PyUnicode_IS_READY(from)); 1398 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); 1399 1400 assert(PyUnicode_Check(to)); 1401 assert(PyUnicode_IS_READY(to)); 1402 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1403 1404 if (how_many == 0) 1405 return 0; 1406 1407 from_kind = PyUnicode_KIND(from); 1408 from_data = PyUnicode_DATA(from); 1409 to_kind = PyUnicode_KIND(to); 1410 to_data = PyUnicode_DATA(to); 1411 1412 #ifdef Py_DEBUG 1413 if (!check_maxchar 1414 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)) 1415 { 1416 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1417 Py_UCS4 ch; 1418 Py_ssize_t i; 1419 for (i=0; i < how_many; i++) { 1420 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1421 assert(ch <= to_maxchar); 1422 } 1423 } 1424 #endif 1425 1426 if (from_kind == to_kind) { 1427 if (check_maxchar 1428 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) 1429 { 1430 /* Writing Latin-1 characters into an ASCII string requires to 1431 check that all written characters are pure ASCII */ 1432 Py_UCS4 max_char; 1433 max_char = ucs1lib_find_max_char(from_data, 1434 (Py_UCS1*)from_data + how_many); 1435 if (max_char >= 128) 1436 return -1; 1437 } 1438 memcpy((char*)to_data + to_kind * to_start, 1439 (char*)from_data + from_kind * from_start, 1440 to_kind * how_many); 1441 } 1442 else if (from_kind == PyUnicode_1BYTE_KIND 1443 && to_kind == PyUnicode_2BYTE_KIND) 1444 { 1445 _PyUnicode_CONVERT_BYTES( 1446 
Py_UCS1, Py_UCS2, 1447 PyUnicode_1BYTE_DATA(from) + from_start, 1448 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1449 PyUnicode_2BYTE_DATA(to) + to_start 1450 ); 1451 } 1452 else if (from_kind == PyUnicode_1BYTE_KIND 1453 && to_kind == PyUnicode_4BYTE_KIND) 1454 { 1455 _PyUnicode_CONVERT_BYTES( 1456 Py_UCS1, Py_UCS4, 1457 PyUnicode_1BYTE_DATA(from) + from_start, 1458 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1459 PyUnicode_4BYTE_DATA(to) + to_start 1460 ); 1461 } 1462 else if (from_kind == PyUnicode_2BYTE_KIND 1463 && to_kind == PyUnicode_4BYTE_KIND) 1464 { 1465 _PyUnicode_CONVERT_BYTES( 1466 Py_UCS2, Py_UCS4, 1467 PyUnicode_2BYTE_DATA(from) + from_start, 1468 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1469 PyUnicode_4BYTE_DATA(to) + to_start 1470 ); 1471 } 1472 else { 1473 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)); 1474 1475 if (!check_maxchar) { 1476 if (from_kind == PyUnicode_2BYTE_KIND 1477 && to_kind == PyUnicode_1BYTE_KIND) 1478 { 1479 _PyUnicode_CONVERT_BYTES( 1480 Py_UCS2, Py_UCS1, 1481 PyUnicode_2BYTE_DATA(from) + from_start, 1482 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1483 PyUnicode_1BYTE_DATA(to) + to_start 1484 ); 1485 } 1486 else if (from_kind == PyUnicode_4BYTE_KIND 1487 && to_kind == PyUnicode_1BYTE_KIND) 1488 { 1489 _PyUnicode_CONVERT_BYTES( 1490 Py_UCS4, Py_UCS1, 1491 PyUnicode_4BYTE_DATA(from) + from_start, 1492 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1493 PyUnicode_1BYTE_DATA(to) + to_start 1494 ); 1495 } 1496 else if (from_kind == PyUnicode_4BYTE_KIND 1497 && to_kind == PyUnicode_2BYTE_KIND) 1498 { 1499 _PyUnicode_CONVERT_BYTES( 1500 Py_UCS4, Py_UCS2, 1501 PyUnicode_4BYTE_DATA(from) + from_start, 1502 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1503 PyUnicode_2BYTE_DATA(to) + to_start 1504 ); 1505 } 1506 else { 1507 assert(0); 1508 return -1; 1509 } 1510 } 1511 else { 1512 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1513 Py_UCS4 ch; 1514 
Py_ssize_t i; 1515 1516 for (i=0; i < how_many; i++) { 1517 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1518 if (ch > to_maxchar) 1519 return -1; 1520 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1521 } 1522 } 1523 } 1524 return 0; 1525 } 1526 1527 void 1528 _PyUnicode_FastCopyCharacters( 1529 PyObject *to, Py_ssize_t to_start, 1530 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many) 1531 { 1532 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1533 } 1534 1535 Py_ssize_t 1536 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1537 PyObject *from, Py_ssize_t from_start, 1538 Py_ssize_t how_many) 1539 { 1540 int err; 1541 1542 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1543 PyErr_BadInternalCall(); 1544 return -1; 1545 } 1546 1547 if (PyUnicode_READY(from) == -1) 1548 return -1; 1549 if (PyUnicode_READY(to) == -1) 1550 return -1; 1551 1552 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) { 1553 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1554 return -1; 1555 } 1556 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) { 1557 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1558 return -1; 1559 } 1560 if (how_many < 0) { 1561 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative"); 1562 return -1; 1563 } 1564 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many); 1565 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1566 PyErr_Format(PyExc_SystemError, 1567 "Cannot write %zi characters at %zi " 1568 "in a string of %zi characters", 1569 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1570 return -1; 1571 } 1572 1573 if (how_many == 0) 1574 return 0; 1575 1576 if (unicode_check_modifiable(to)) 1577 return -1; 1578 1579 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1580 if (err) { 1581 PyErr_Format(PyExc_SystemError, 1582 "Cannot copy %s characters " 1583 "into a string of %s characters", 1584 
/* Convert a not-yet-ready string, which only has a wchar_t (wstr) buffer,
   into its canonical form.

   The wstr buffer is scanned for its maximum character, then a data buffer
   of the narrowest sufficient kind (1, 2 or 4 bytes per character) is
   filled in.  When wchar_t has the same width as the chosen kind the wstr
   buffer itself becomes the data buffer; otherwise a new buffer is
   allocated, the characters are converted and wstr is freed.

   Returns 0 on success.  Returns -1 with an exception set if a character is
   out of range (reported by find_maxchar_surrogates()) or if memory runs
   out. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* Determine the widest character, and how many surrogate pairs there
       are, to choose the storage kind and the final length. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* 1 byte per character, plus one byte for the terminating null. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer is also used as the UTF-8
               buffer. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        /* The wchar_t buffer is no longer needed. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* The wchar_t buffer is no longer needed. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses into a single code point. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        /* Guard the 4 * (length + 1) size computation against overflow. */
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: share the wstr buffer as the
           data buffer instead of copying. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
(_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1788 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1789 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1790 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1791 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) 1792 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1793 1794 Py_TYPE(unicode)->tp_free(unicode); 1795 } 1796 1797 #ifdef Py_DEBUG 1798 static int 1799 unicode_is_singleton(PyObject *unicode) 1800 { 1801 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1802 if (unicode == unicode_empty) 1803 return 1; 1804 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1805 { 1806 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1807 if (ch < 256 && unicode_latin1[ch] == unicode) 1808 return 1; 1809 } 1810 return 0; 1811 } 1812 #endif 1813 1814 static int 1815 unicode_modifiable(PyObject *unicode) 1816 { 1817 assert(_PyUnicode_CHECK(unicode)); 1818 if (Py_REFCNT(unicode) != 1) 1819 return 0; 1820 if (_PyUnicode_HASH(unicode) != -1) 1821 return 0; 1822 if (PyUnicode_CHECK_INTERNED(unicode)) 1823 return 0; 1824 if (!PyUnicode_CheckExact(unicode)) 1825 return 0; 1826 #ifdef Py_DEBUG 1827 /* singleton refcount is greater than 1 */ 1828 assert(!unicode_is_singleton(unicode)); 1829 #endif 1830 return 1; 1831 } 1832 1833 static int 1834 unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1835 { 1836 PyObject *unicode; 1837 Py_ssize_t old_length; 1838 1839 assert(p_unicode != NULL); 1840 unicode = *p_unicode; 1841 1842 assert(unicode != NULL); 1843 assert(PyUnicode_Check(unicode)); 1844 assert(0 <= length); 1845 1846 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1847 old_length = PyUnicode_WSTR_LENGTH(unicode); 1848 else 1849 old_length = PyUnicode_GET_LENGTH(unicode); 1850 if (old_length == length) 1851 return 0; 1852 1853 if (length == 0) { 1854 _Py_INCREF_UNICODE_EMPTY(); 1855 if (!unicode_empty) 1856 return -1; 1857 Py_SETREF(*p_unicode, unicode_empty); 1858 return 0; 1859 } 1860 1861 if 
(!unicode_modifiable(unicode)) { 1862 PyObject *copy = resize_copy(unicode, length); 1863 if (copy == NULL) 1864 return -1; 1865 Py_SETREF(*p_unicode, copy); 1866 return 0; 1867 } 1868 1869 if (PyUnicode_IS_COMPACT(unicode)) { 1870 PyObject *new_unicode = resize_compact(unicode, length); 1871 if (new_unicode == NULL) 1872 return -1; 1873 *p_unicode = new_unicode; 1874 return 0; 1875 } 1876 return resize_inplace(unicode, length); 1877 } 1878 1879 int 1880 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1881 { 1882 PyObject *unicode; 1883 if (p_unicode == NULL) { 1884 PyErr_BadInternalCall(); 1885 return -1; 1886 } 1887 unicode = *p_unicode; 1888 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1889 { 1890 PyErr_BadInternalCall(); 1891 return -1; 1892 } 1893 return unicode_resize(p_unicode, length); 1894 } 1895 1896 /* Copy an ASCII or latin1 char* string into a Python Unicode string. 1897 1898 WARNING: The function doesn't copy the terminating null character and 1899 doesn't check the maximum character (may write a latin1 character in an 1900 ASCII string). 
*/ 1901 static void 1902 unicode_write_cstr(PyObject *unicode, Py_ssize_t index, 1903 const char *str, Py_ssize_t len) 1904 { 1905 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 1906 void *data = PyUnicode_DATA(unicode); 1907 const char *end = str + len; 1908 1909 switch (kind) { 1910 case PyUnicode_1BYTE_KIND: { 1911 assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 1912 #ifdef Py_DEBUG 1913 if (PyUnicode_IS_ASCII(unicode)) { 1914 Py_UCS4 maxchar = ucs1lib_find_max_char( 1915 (const Py_UCS1*)str, 1916 (const Py_UCS1*)str + len); 1917 assert(maxchar < 128); 1918 } 1919 #endif 1920 memcpy((char *) data + index, str, len); 1921 break; 1922 } 1923 case PyUnicode_2BYTE_KIND: { 1924 Py_UCS2 *start = (Py_UCS2 *)data + index; 1925 Py_UCS2 *ucs2 = start; 1926 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1927 1928 for (; str < end; ++ucs2, ++str) 1929 *ucs2 = (Py_UCS2)*str; 1930 1931 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 1932 break; 1933 } 1934 default: { 1935 Py_UCS4 *start = (Py_UCS4 *)data + index; 1936 Py_UCS4 *ucs4 = start; 1937 assert(kind == PyUnicode_4BYTE_KIND); 1938 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1939 1940 for (; str < end; ++ucs4, ++str) 1941 *ucs4 = (Py_UCS4)*str; 1942 1943 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 1944 } 1945 } 1946 } 1947 1948 static PyObject* 1949 get_latin1_char(unsigned char ch) 1950 { 1951 PyObject *unicode = unicode_latin1[ch]; 1952 if (!unicode) { 1953 unicode = PyUnicode_New(1, ch); 1954 if (!unicode) 1955 return NULL; 1956 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1957 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1958 unicode_latin1[ch] = unicode; 1959 } 1960 Py_INCREF(unicode); 1961 return unicode; 1962 } 1963 1964 static PyObject* 1965 unicode_char(Py_UCS4 ch) 1966 { 1967 PyObject *unicode; 1968 1969 assert(ch <= MAX_UNICODE); 1970 1971 if (ch < 256) 1972 return get_latin1_char(ch); 1973 1974 unicode = PyUnicode_New(1, ch); 1975 if (unicode == NULL) 1976 return NULL; 
1977 switch (PyUnicode_KIND(unicode)) { 1978 case PyUnicode_1BYTE_KIND: 1979 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch; 1980 break; 1981 case PyUnicode_2BYTE_KIND: 1982 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch; 1983 break; 1984 default: 1985 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1986 PyUnicode_4BYTE_DATA(unicode)[0] = ch; 1987 } 1988 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1989 return unicode; 1990 } 1991 1992 PyObject * 1993 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1994 { 1995 PyObject *unicode; 1996 Py_UCS4 maxchar = 0; 1997 Py_ssize_t num_surrogates; 1998 1999 if (u == NULL) 2000 return (PyObject*)_PyUnicode_New(size); 2001 2002 /* If the Unicode data is known at construction time, we can apply 2003 some optimizations which share commonly used objects. */ 2004 2005 /* Optimization for empty strings */ 2006 if (size == 0) 2007 _Py_RETURN_UNICODE_EMPTY(); 2008 2009 /* Single character Unicode objects in the Latin-1 range are 2010 shared when using this constructor */ 2011 if (size == 1 && (Py_UCS4)*u < 256) 2012 return get_latin1_char((unsigned char)*u); 2013 2014 /* If not empty and not single character, copy the Unicode data 2015 into the new object */ 2016 if (find_maxchar_surrogates(u, u + size, 2017 &maxchar, &num_surrogates) == -1) 2018 return NULL; 2019 2020 unicode = PyUnicode_New(size - num_surrogates, maxchar); 2021 if (!unicode) 2022 return NULL; 2023 2024 switch (PyUnicode_KIND(unicode)) { 2025 case PyUnicode_1BYTE_KIND: 2026 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 2027 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 2028 break; 2029 case PyUnicode_2BYTE_KIND: 2030 #if Py_UNICODE_SIZE == 2 2031 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 2032 #else 2033 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 2034 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 2035 #endif 2036 break; 2037 case PyUnicode_4BYTE_KIND: 2038 #if SIZEOF_WCHAR_T == 2 2039 /* This is the only case which has to 
process surrogates, thus 2040 a simple copy loop is not enough and we need a function. */ 2041 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 2042 #else 2043 assert(num_surrogates == 0); 2044 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 2045 #endif 2046 break; 2047 default: 2048 assert(0 && "Impossible state"); 2049 } 2050 2051 return unicode_result(unicode); 2052 } 2053 2054 PyObject * 2055 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 2056 { 2057 if (size < 0) { 2058 PyErr_SetString(PyExc_SystemError, 2059 "Negative size passed to PyUnicode_FromStringAndSize"); 2060 return NULL; 2061 } 2062 if (u != NULL) 2063 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); 2064 else 2065 return (PyObject *)_PyUnicode_New(size); 2066 } 2067 2068 PyObject * 2069 PyUnicode_FromString(const char *u) 2070 { 2071 size_t size = strlen(u); 2072 if (size > PY_SSIZE_T_MAX) { 2073 PyErr_SetString(PyExc_OverflowError, "input too long"); 2074 return NULL; 2075 } 2076 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); 2077 } 2078 2079 PyObject * 2080 _PyUnicode_FromId(_Py_Identifier *id) 2081 { 2082 if (!id->object) { 2083 id->object = PyUnicode_DecodeUTF8Stateful(id->string, 2084 strlen(id->string), 2085 NULL, NULL); 2086 if (!id->object) 2087 return NULL; 2088 PyUnicode_InternInPlace(&id->object); 2089 assert(!id->next); 2090 id->next = static_strings; 2091 static_strings = id; 2092 } 2093 return id->object; 2094 } 2095 2096 void 2097 _PyUnicode_ClearStaticStrings() 2098 { 2099 _Py_Identifier *tmp, *s = static_strings; 2100 while (s) { 2101 Py_CLEAR(s->object); 2102 tmp = s->next; 2103 s->next = NULL; 2104 s = tmp; 2105 } 2106 static_strings = NULL; 2107 } 2108 2109 /* Internal function, doesn't check maximum character */ 2110 2111 PyObject* 2112 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) 2113 { 2114 const unsigned char *s = (const unsigned char *)buffer; 2115 PyObject *unicode; 2116 if (size == 1) { 2117 #ifdef Py_DEBUG 
2118 assert((unsigned char)s[0] < 128); 2119 #endif 2120 return get_latin1_char(s[0]); 2121 } 2122 unicode = PyUnicode_New(size, 127); 2123 if (!unicode) 2124 return NULL; 2125 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); 2126 assert(_PyUnicode_CheckConsistency(unicode, 1)); 2127 return unicode; 2128 } 2129 2130 static Py_UCS4 2131 kind_maxchar_limit(unsigned int kind) 2132 { 2133 switch (kind) { 2134 case PyUnicode_1BYTE_KIND: 2135 return 0x80; 2136 case PyUnicode_2BYTE_KIND: 2137 return 0x100; 2138 case PyUnicode_4BYTE_KIND: 2139 return 0x10000; 2140 default: 2141 assert(0 && "invalid kind"); 2142 return MAX_UNICODE; 2143 } 2144 } 2145 2146 static inline Py_UCS4 2147 align_maxchar(Py_UCS4 maxchar) 2148 { 2149 if (maxchar <= 127) 2150 return 127; 2151 else if (maxchar <= 255) 2152 return 255; 2153 else if (maxchar <= 65535) 2154 return 65535; 2155 else 2156 return MAX_UNICODE; 2157 } 2158 2159 static PyObject* 2160 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size) 2161 { 2162 PyObject *res; 2163 unsigned char max_char; 2164 2165 if (size == 0) 2166 _Py_RETURN_UNICODE_EMPTY(); 2167 assert(size > 0); 2168 if (size == 1) 2169 return get_latin1_char(u[0]); 2170 2171 max_char = ucs1lib_find_max_char(u, u + size); 2172 res = PyUnicode_New(size, max_char); 2173 if (!res) 2174 return NULL; 2175 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 2176 assert(_PyUnicode_CheckConsistency(res, 1)); 2177 return res; 2178 } 2179 2180 static PyObject* 2181 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 2182 { 2183 PyObject *res; 2184 Py_UCS2 max_char; 2185 2186 if (size == 0) 2187 _Py_RETURN_UNICODE_EMPTY(); 2188 assert(size > 0); 2189 if (size == 1) 2190 return unicode_char(u[0]); 2191 2192 max_char = ucs2lib_find_max_char(u, u + size); 2193 res = PyUnicode_New(size, max_char); 2194 if (!res) 2195 return NULL; 2196 if (max_char >= 256) 2197 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 2198 else { 2199 _PyUnicode_CONVERT_BYTES( 2200 Py_UCS2, Py_UCS1, u, u 
+ size, PyUnicode_1BYTE_DATA(res)); 2201 } 2202 assert(_PyUnicode_CheckConsistency(res, 1)); 2203 return res; 2204 } 2205 2206 static PyObject* 2207 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 2208 { 2209 PyObject *res; 2210 Py_UCS4 max_char; 2211 2212 if (size == 0) 2213 _Py_RETURN_UNICODE_EMPTY(); 2214 assert(size > 0); 2215 if (size == 1) 2216 return unicode_char(u[0]); 2217 2218 max_char = ucs4lib_find_max_char(u, u + size); 2219 res = PyUnicode_New(size, max_char); 2220 if (!res) 2221 return NULL; 2222 if (max_char < 256) 2223 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 2224 PyUnicode_1BYTE_DATA(res)); 2225 else if (max_char < 0x10000) 2226 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 2227 PyUnicode_2BYTE_DATA(res)); 2228 else 2229 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 2230 assert(_PyUnicode_CheckConsistency(res, 1)); 2231 return res; 2232 } 2233 2234 PyObject* 2235 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 2236 { 2237 if (size < 0) { 2238 PyErr_SetString(PyExc_ValueError, "size must be positive"); 2239 return NULL; 2240 } 2241 switch (kind) { 2242 case PyUnicode_1BYTE_KIND: 2243 return _PyUnicode_FromUCS1(buffer, size); 2244 case PyUnicode_2BYTE_KIND: 2245 return _PyUnicode_FromUCS2(buffer, size); 2246 case PyUnicode_4BYTE_KIND: 2247 return _PyUnicode_FromUCS4(buffer, size); 2248 default: 2249 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2250 return NULL; 2251 } 2252 } 2253 2254 Py_UCS4 2255 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end) 2256 { 2257 enum PyUnicode_Kind kind; 2258 void *startptr, *endptr; 2259 2260 assert(PyUnicode_IS_READY(unicode)); 2261 assert(0 <= start); 2262 assert(end <= PyUnicode_GET_LENGTH(unicode)); 2263 assert(start <= end); 2264 2265 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode)) 2266 return PyUnicode_MAX_CHAR_VALUE(unicode); 2267 2268 if (start == end) 2269 return 127; 2270 2271 if 
(PyUnicode_IS_ASCII(unicode)) 2272 return 127; 2273 2274 kind = PyUnicode_KIND(unicode); 2275 startptr = PyUnicode_DATA(unicode); 2276 endptr = (char *)startptr + end * kind; 2277 startptr = (char *)startptr + start * kind; 2278 switch(kind) { 2279 case PyUnicode_1BYTE_KIND: 2280 return ucs1lib_find_max_char(startptr, endptr); 2281 case PyUnicode_2BYTE_KIND: 2282 return ucs2lib_find_max_char(startptr, endptr); 2283 case PyUnicode_4BYTE_KIND: 2284 return ucs4lib_find_max_char(startptr, endptr); 2285 default: 2286 assert(0); 2287 return 0; 2288 } 2289 } 2290 2291 /* Ensure that a string uses the most efficient storage, if it is not the 2292 case: create a new string with of the right kind. Write NULL into *p_unicode 2293 on error. */ 2294 static void 2295 unicode_adjust_maxchar(PyObject **p_unicode) 2296 { 2297 PyObject *unicode, *copy; 2298 Py_UCS4 max_char; 2299 Py_ssize_t len; 2300 unsigned int kind; 2301 2302 assert(p_unicode != NULL); 2303 unicode = *p_unicode; 2304 assert(PyUnicode_IS_READY(unicode)); 2305 if (PyUnicode_IS_ASCII(unicode)) 2306 return; 2307 2308 len = PyUnicode_GET_LENGTH(unicode); 2309 kind = PyUnicode_KIND(unicode); 2310 if (kind == PyUnicode_1BYTE_KIND) { 2311 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 2312 max_char = ucs1lib_find_max_char(u, u + len); 2313 if (max_char >= 128) 2314 return; 2315 } 2316 else if (kind == PyUnicode_2BYTE_KIND) { 2317 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 2318 max_char = ucs2lib_find_max_char(u, u + len); 2319 if (max_char >= 256) 2320 return; 2321 } 2322 else { 2323 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 2324 assert(kind == PyUnicode_4BYTE_KIND); 2325 max_char = ucs4lib_find_max_char(u, u + len); 2326 if (max_char >= 0x10000) 2327 return; 2328 } 2329 copy = PyUnicode_New(len, max_char); 2330 if (copy != NULL) 2331 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); 2332 Py_DECREF(unicode); 2333 *p_unicode = copy; 2334 } 2335 2336 PyObject* 2337 _PyUnicode_Copy(PyObject 
*unicode) 2338 { 2339 Py_ssize_t length; 2340 PyObject *copy; 2341 2342 if (!PyUnicode_Check(unicode)) { 2343 PyErr_BadInternalCall(); 2344 return NULL; 2345 } 2346 if (PyUnicode_READY(unicode) == -1) 2347 return NULL; 2348 2349 length = PyUnicode_GET_LENGTH(unicode); 2350 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 2351 if (!copy) 2352 return NULL; 2353 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 2354 2355 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), 2356 length * PyUnicode_KIND(unicode)); 2357 assert(_PyUnicode_CheckConsistency(copy, 1)); 2358 return copy; 2359 } 2360 2361 2362 /* Widen Unicode objects to larger buffers. Don't write terminating null 2363 character. Return NULL on error. */ 2364 2365 void* 2366 _PyUnicode_AsKind(PyObject *s, unsigned int kind) 2367 { 2368 Py_ssize_t len; 2369 void *result; 2370 unsigned int skind; 2371 2372 if (PyUnicode_READY(s) == -1) 2373 return NULL; 2374 2375 len = PyUnicode_GET_LENGTH(s); 2376 skind = PyUnicode_KIND(s); 2377 if (skind >= kind) { 2378 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 2379 return NULL; 2380 } 2381 switch (kind) { 2382 case PyUnicode_2BYTE_KIND: 2383 result = PyMem_New(Py_UCS2, len); 2384 if (!result) 2385 return PyErr_NoMemory(); 2386 assert(skind == PyUnicode_1BYTE_KIND); 2387 _PyUnicode_CONVERT_BYTES( 2388 Py_UCS1, Py_UCS2, 2389 PyUnicode_1BYTE_DATA(s), 2390 PyUnicode_1BYTE_DATA(s) + len, 2391 result); 2392 return result; 2393 case PyUnicode_4BYTE_KIND: 2394 result = PyMem_New(Py_UCS4, len); 2395 if (!result) 2396 return PyErr_NoMemory(); 2397 if (skind == PyUnicode_2BYTE_KIND) { 2398 _PyUnicode_CONVERT_BYTES( 2399 Py_UCS2, Py_UCS4, 2400 PyUnicode_2BYTE_DATA(s), 2401 PyUnicode_2BYTE_DATA(s) + len, 2402 result); 2403 } 2404 else { 2405 assert(skind == PyUnicode_1BYTE_KIND); 2406 _PyUnicode_CONVERT_BYTES( 2407 Py_UCS1, Py_UCS4, 2408 PyUnicode_1BYTE_DATA(s), 2409 PyUnicode_1BYTE_DATA(s) + len, 2410 result); 2411 } 2412 return result; 2413 
default: 2414 break; 2415 } 2416 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2417 return NULL; 2418 } 2419 2420 static Py_UCS4* 2421 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2422 int copy_null) 2423 { 2424 int kind; 2425 void *data; 2426 Py_ssize_t len, targetlen; 2427 if (PyUnicode_READY(string) == -1) 2428 return NULL; 2429 kind = PyUnicode_KIND(string); 2430 data = PyUnicode_DATA(string); 2431 len = PyUnicode_GET_LENGTH(string); 2432 targetlen = len; 2433 if (copy_null) 2434 targetlen++; 2435 if (!target) { 2436 target = PyMem_New(Py_UCS4, targetlen); 2437 if (!target) { 2438 PyErr_NoMemory(); 2439 return NULL; 2440 } 2441 } 2442 else { 2443 if (targetsize < targetlen) { 2444 PyErr_Format(PyExc_SystemError, 2445 "string is longer than the buffer"); 2446 if (copy_null && 0 < targetsize) 2447 target[0] = 0; 2448 return NULL; 2449 } 2450 } 2451 if (kind == PyUnicode_1BYTE_KIND) { 2452 Py_UCS1 *start = (Py_UCS1 *) data; 2453 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2454 } 2455 else if (kind == PyUnicode_2BYTE_KIND) { 2456 Py_UCS2 *start = (Py_UCS2 *) data; 2457 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2458 } 2459 else { 2460 assert(kind == PyUnicode_4BYTE_KIND); 2461 memcpy(target, data, len * sizeof(Py_UCS4)); 2462 } 2463 if (copy_null) 2464 target[len] = 0; 2465 return target; 2466 } 2467 2468 Py_UCS4* 2469 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2470 int copy_null) 2471 { 2472 if (target == NULL || targetsize < 0) { 2473 PyErr_BadInternalCall(); 2474 return NULL; 2475 } 2476 return as_ucs4(string, target, targetsize, copy_null); 2477 } 2478 2479 Py_UCS4* 2480 PyUnicode_AsUCS4Copy(PyObject *string) 2481 { 2482 return as_ucs4(string, NULL, 0, 1); 2483 } 2484 2485 #ifdef HAVE_WCHAR_H 2486 2487 PyObject * 2488 PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size) 2489 { 2490 if (w == NULL) { 2491 if (size == 0) 2492 
_Py_RETURN_UNICODE_EMPTY(); 2493 PyErr_BadInternalCall(); 2494 return NULL; 2495 } 2496 2497 if (size == -1) { 2498 size = wcslen(w); 2499 } 2500 2501 return PyUnicode_FromUnicode(w, size); 2502 } 2503 2504 #endif /* HAVE_WCHAR_H */ 2505 2506 /* maximum number of characters required for output of %lld or %p. 2507 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2508 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 2509 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2510 2511 static int 2512 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, 2513 Py_ssize_t width, Py_ssize_t precision) 2514 { 2515 Py_ssize_t length, fill, arglen; 2516 Py_UCS4 maxchar; 2517 2518 if (PyUnicode_READY(str) == -1) 2519 return -1; 2520 2521 length = PyUnicode_GET_LENGTH(str); 2522 if ((precision == -1 || precision >= length) 2523 && width <= length) 2524 return _PyUnicodeWriter_WriteStr(writer, str); 2525 2526 if (precision != -1) 2527 length = Py_MIN(precision, length); 2528 2529 arglen = Py_MAX(length, width); 2530 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 2531 maxchar = _PyUnicode_FindMaxChar(str, 0, length); 2532 else 2533 maxchar = writer->maxchar; 2534 2535 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1) 2536 return -1; 2537 2538 if (width > length) { 2539 fill = width - length; 2540 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) 2541 return -1; 2542 writer->pos += fill; 2543 } 2544 2545 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 2546 str, 0, length); 2547 writer->pos += length; 2548 return 0; 2549 } 2550 2551 static int 2552 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, 2553 Py_ssize_t width, Py_ssize_t precision) 2554 { 2555 /* UTF-8 */ 2556 Py_ssize_t length; 2557 PyObject *unicode; 2558 int res; 2559 2560 length = strlen(str); 2561 if (precision != -1) 2562 length = Py_MIN(length, precision); 2563 unicode = 
PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
    if (unicode == NULL)
        return -1;

    /* The precision was already applied to the byte length above, so only
       the width is forwarded. */
    res = unicode_fromformat_write_str(writer, unicode, width, -1);
    Py_DECREF(unicode);
    return res;
}

/* Process one conversion specification of a PyUnicode_FromFormat() style
   format string.

   `f` points at the '%' that introduces the specification and `vargs`
   supplies the matching argument(s), consumed with va_arg().  The formatted
   text is appended to `writer`.

   Returns a pointer to the first character after the specification, or
   NULL on error.  For an unrecognised conversion character the remainder
   of the format string (starting at the '%') is copied to the output
   unchanged and a pointer to its terminating NUL is returned. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* Remember where the specification starts (used by the fallback in the
       default case), then skip the '%'. */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Overflow check performed before the multiply-add below. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Same overflow check as for the width. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* This is the last specification in the format string: stop
       over-allocating the output buffer. */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* Render the bare digits with sprintf(); width and precision are
           applied by hand afterwards. */
        if (*f == 'u') {
            if (longflag)
                len = sprintf(buffer, "%lu",
                        va_arg(*vargs, unsigned long));
            else if (longlongflag)
                len = sprintf(buffer, "%llu",
                        va_arg(*vargs, unsigned long long));
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, "%u",
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag)
                len = sprintf(buffer, "%li",
                        va_arg(*vargs, long));
            else if (longlongflag)
                len = sprintf(buffer, "%lli",
                        va_arg(*vargs, long long));
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, "%i",
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* From here on, precision is the number of characters written for
           the number itself, zero padding included. */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* Pad up to the field width (with '0' if the 0 flag was given,
           spaces otherwise). */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* Pad up to the precision with zeros. */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* Shift the digits (and the NUL) right to make room for "0x". */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments are always consumed: a str object, and a C string
           that is used when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* str(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* repr(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* ascii(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* Step over the conversion character itself. */
    f++;
    return f;
}

/* Build a str object from a printf-like format string and a va_list.
   Literal runs of the format are copied as ASCII; each '%' specification
   is handed to unicode_fromformat_arg(). */
PyObject *
PyUnicode_FromFormatV(const char *format, va_list vargs)
{
    va_list vargs2;
    const char *f;
    _PyUnicodeWriter writer;

    _PyUnicodeWriter_Init(&writer);
    /* Initial size estimate: the format length plus some headroom for the
       formatted arguments. */
    writer.min_length = strlen(format) + 100;
    writer.overallocate = 1;

    // Copy varargs to be able to pass a reference to a subfunction.
2882 va_copy(vargs2, vargs); 2883 2884 for (f = format; *f; ) { 2885 if (*f == '%') { 2886 f = unicode_fromformat_arg(&writer, f, &vargs2); 2887 if (f == NULL) 2888 goto fail; 2889 } 2890 else { 2891 const char *p; 2892 Py_ssize_t len; 2893 2894 p = f; 2895 do 2896 { 2897 if ((unsigned char)*p > 127) { 2898 PyErr_Format(PyExc_ValueError, 2899 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2900 "string, got a non-ASCII byte: 0x%02x", 2901 (unsigned char)*p); 2902 goto fail; 2903 } 2904 p++; 2905 } 2906 while (*p != '\0' && *p != '%'); 2907 len = p - f; 2908 2909 if (*p == '\0') 2910 writer.overallocate = 0; 2911 2912 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0) 2913 goto fail; 2914 2915 f = p; 2916 } 2917 } 2918 va_end(vargs2); 2919 return _PyUnicodeWriter_Finish(&writer); 2920 2921 fail: 2922 va_end(vargs2); 2923 _PyUnicodeWriter_Dealloc(&writer); 2924 return NULL; 2925 } 2926 2927 PyObject * 2928 PyUnicode_FromFormat(const char *format, ...) 2929 { 2930 PyObject* ret; 2931 va_list vargs; 2932 2933 #ifdef HAVE_STDARG_PROTOTYPES 2934 va_start(vargs, format); 2935 #else 2936 va_start(vargs); 2937 #endif 2938 ret = PyUnicode_FromFormatV(format, vargs); 2939 va_end(vargs); 2940 return ret; 2941 } 2942 2943 #ifdef HAVE_WCHAR_H 2944 2945 /* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2946 convert a Unicode object to a wide character string. 2947 2948 - If w is NULL: return the number of wide characters (including the null 2949 character) required to convert the unicode object. Ignore size argument. 2950 2951 - Otherwise: return the number of wide characters (excluding the null 2952 character) written into w. Write at most size wide characters (including 2953 the null character). 
*/ 2954 static Py_ssize_t 2955 unicode_aswidechar(PyObject *unicode, 2956 wchar_t *w, 2957 Py_ssize_t size) 2958 { 2959 Py_ssize_t res; 2960 const wchar_t *wstr; 2961 2962 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2963 if (wstr == NULL) 2964 return -1; 2965 2966 if (w != NULL) { 2967 if (size > res) 2968 size = res + 1; 2969 else 2970 res = size; 2971 memcpy(w, wstr, size * sizeof(wchar_t)); 2972 return res; 2973 } 2974 else 2975 return res + 1; 2976 } 2977 2978 Py_ssize_t 2979 PyUnicode_AsWideChar(PyObject *unicode, 2980 wchar_t *w, 2981 Py_ssize_t size) 2982 { 2983 if (unicode == NULL) { 2984 PyErr_BadInternalCall(); 2985 return -1; 2986 } 2987 return unicode_aswidechar(unicode, w, size); 2988 } 2989 2990 wchar_t* 2991 PyUnicode_AsWideCharString(PyObject *unicode, 2992 Py_ssize_t *size) 2993 { 2994 wchar_t* buffer; 2995 Py_ssize_t buflen; 2996 2997 if (unicode == NULL) { 2998 PyErr_BadInternalCall(); 2999 return NULL; 3000 } 3001 3002 buflen = unicode_aswidechar(unicode, NULL, 0); 3003 if (buflen == -1) 3004 return NULL; 3005 buffer = PyMem_NEW(wchar_t, buflen); 3006 if (buffer == NULL) { 3007 PyErr_NoMemory(); 3008 return NULL; 3009 } 3010 buflen = unicode_aswidechar(unicode, buffer, buflen); 3011 if (buflen == -1) { 3012 PyMem_FREE(buffer); 3013 return NULL; 3014 } 3015 if (size != NULL) 3016 *size = buflen; 3017 return buffer; 3018 } 3019 3020 #endif /* HAVE_WCHAR_H */ 3021 3022 PyObject * 3023 PyUnicode_FromOrdinal(int ordinal) 3024 { 3025 if (ordinal < 0 || ordinal > MAX_UNICODE) { 3026 PyErr_SetString(PyExc_ValueError, 3027 "chr() arg not in range(0x110000)"); 3028 return NULL; 3029 } 3030 3031 return unicode_char((Py_UCS4)ordinal); 3032 } 3033 3034 PyObject * 3035 PyUnicode_FromObject(PyObject *obj) 3036 { 3037 /* XXX Perhaps we should make this API an alias of 3038 PyObject_Str() instead ?! 
*/ 3039 if (PyUnicode_CheckExact(obj)) { 3040 if (PyUnicode_READY(obj) == -1) 3041 return NULL; 3042 Py_INCREF(obj); 3043 return obj; 3044 } 3045 if (PyUnicode_Check(obj)) { 3046 /* For a Unicode subtype that's not a Unicode object, 3047 return a true Unicode object with the same data. */ 3048 return _PyUnicode_Copy(obj); 3049 } 3050 PyErr_Format(PyExc_TypeError, 3051 "Can't convert '%.100s' object to str implicitly", 3052 Py_TYPE(obj)->tp_name); 3053 return NULL; 3054 } 3055 3056 PyObject * 3057 PyUnicode_FromEncodedObject(PyObject *obj, 3058 const char *encoding, 3059 const char *errors) 3060 { 3061 Py_buffer buffer; 3062 PyObject *v; 3063 3064 if (obj == NULL) { 3065 PyErr_BadInternalCall(); 3066 return NULL; 3067 } 3068 3069 /* Decoding bytes objects is the most common case and should be fast */ 3070 if (PyBytes_Check(obj)) { 3071 if (PyBytes_GET_SIZE(obj) == 0) 3072 _Py_RETURN_UNICODE_EMPTY(); 3073 v = PyUnicode_Decode( 3074 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 3075 encoding, errors); 3076 return v; 3077 } 3078 3079 if (PyUnicode_Check(obj)) { 3080 PyErr_SetString(PyExc_TypeError, 3081 "decoding str is not supported"); 3082 return NULL; 3083 } 3084 3085 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 3086 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 3087 PyErr_Format(PyExc_TypeError, 3088 "decoding to str: need a bytes-like object, %.80s found", 3089 Py_TYPE(obj)->tp_name); 3090 return NULL; 3091 } 3092 3093 if (buffer.len == 0) { 3094 PyBuffer_Release(&buffer); 3095 _Py_RETURN_UNICODE_EMPTY(); 3096 } 3097 3098 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 3099 PyBuffer_Release(&buffer); 3100 return v; 3101 } 3102 3103 /* Normalize an encoding name: similar to encodings.normalize_encoding(), but 3104 also convert to lowercase. Return 1 on success, or 0 on error (encoding is 3105 longer than lower_len-1). 
*/ 3106 int 3107 _Py_normalize_encoding(const char *encoding, 3108 char *lower, 3109 size_t lower_len) 3110 { 3111 const char *e; 3112 char *l; 3113 char *l_end; 3114 int punct; 3115 3116 assert(encoding != NULL); 3117 3118 e = encoding; 3119 l = lower; 3120 l_end = &lower[lower_len - 1]; 3121 punct = 0; 3122 while (1) { 3123 char c = *e; 3124 if (c == 0) { 3125 break; 3126 } 3127 3128 if (Py_ISALNUM(c) || c == '.') { 3129 if (punct && l != lower) { 3130 if (l == l_end) { 3131 return 0; 3132 } 3133 *l++ = '_'; 3134 } 3135 punct = 0; 3136 3137 if (l == l_end) { 3138 return 0; 3139 } 3140 *l++ = Py_TOLOWER(c); 3141 } 3142 else { 3143 punct = 1; 3144 } 3145 3146 e++; 3147 } 3148 *l = '\0'; 3149 return 1; 3150 } 3151 3152 PyObject * 3153 PyUnicode_Decode(const char *s, 3154 Py_ssize_t size, 3155 const char *encoding, 3156 const char *errors) 3157 { 3158 PyObject *buffer = NULL, *unicode; 3159 Py_buffer info; 3160 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */ 3161 3162 if (encoding == NULL) { 3163 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3164 } 3165 3166 /* Shortcuts for common default encodings */ 3167 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { 3168 char *lower = buflower; 3169 3170 /* Fast paths */ 3171 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') { 3172 lower += 3; 3173 if (*lower == '_') { 3174 /* Match "utf8" and "utf_8" */ 3175 lower++; 3176 } 3177 3178 if (lower[0] == '8' && lower[1] == 0) { 3179 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3180 } 3181 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) { 3182 return PyUnicode_DecodeUTF16(s, size, errors, 0); 3183 } 3184 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) { 3185 return PyUnicode_DecodeUTF32(s, size, errors, 0); 3186 } 3187 } 3188 else { 3189 if (strcmp(lower, "ascii") == 0 3190 || strcmp(lower, "us_ascii") == 0) { 3191 return PyUnicode_DecodeASCII(s, size, errors); 3192 } 3193 
#ifdef MS_WINDOWS 3194 else if (strcmp(lower, "mbcs") == 0) { 3195 return PyUnicode_DecodeMBCS(s, size, errors); 3196 } 3197 #endif 3198 else if (strcmp(lower, "latin1") == 0 3199 || strcmp(lower, "latin_1") == 0 3200 || strcmp(lower, "iso_8859_1") == 0 3201 || strcmp(lower, "iso8859_1") == 0) { 3202 return PyUnicode_DecodeLatin1(s, size, errors); 3203 } 3204 } 3205 } 3206 3207 /* Decode via the codec registry */ 3208 buffer = NULL; 3209 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 3210 goto onError; 3211 buffer = PyMemoryView_FromBuffer(&info); 3212 if (buffer == NULL) 3213 goto onError; 3214 unicode = _PyCodec_DecodeText(buffer, encoding, errors); 3215 if (unicode == NULL) 3216 goto onError; 3217 if (!PyUnicode_Check(unicode)) { 3218 PyErr_Format(PyExc_TypeError, 3219 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3220 "use codecs.decode() to decode to arbitrary types", 3221 encoding, 3222 Py_TYPE(unicode)->tp_name); 3223 Py_DECREF(unicode); 3224 goto onError; 3225 } 3226 Py_DECREF(buffer); 3227 return unicode_result(unicode); 3228 3229 onError: 3230 Py_XDECREF(buffer); 3231 return NULL; 3232 } 3233 3234 PyObject * 3235 PyUnicode_AsDecodedObject(PyObject *unicode, 3236 const char *encoding, 3237 const char *errors) 3238 { 3239 if (!PyUnicode_Check(unicode)) { 3240 PyErr_BadArgument(); 3241 return NULL; 3242 } 3243 3244 if (PyErr_WarnEx(PyExc_DeprecationWarning, 3245 "PyUnicode_AsDecodedObject() is deprecated; " 3246 "use PyCodec_Decode() to decode from str", 1) < 0) 3247 return NULL; 3248 3249 if (encoding == NULL) 3250 encoding = PyUnicode_GetDefaultEncoding(); 3251 3252 /* Decode via the codec registry */ 3253 return PyCodec_Decode(unicode, encoding, errors); 3254 } 3255 3256 PyObject * 3257 PyUnicode_AsDecodedUnicode(PyObject *unicode, 3258 const char *encoding, 3259 const char *errors) 3260 { 3261 PyObject *v; 3262 3263 if (!PyUnicode_Check(unicode)) { 3264 PyErr_BadArgument(); 3265 goto onError; 3266 } 3267 3268 if 
(PyErr_WarnEx(PyExc_DeprecationWarning, 3269 "PyUnicode_AsDecodedUnicode() is deprecated; " 3270 "use PyCodec_Decode() to decode from str to str", 1) < 0) 3271 return NULL; 3272 3273 if (encoding == NULL) 3274 encoding = PyUnicode_GetDefaultEncoding(); 3275 3276 /* Decode via the codec registry */ 3277 v = PyCodec_Decode(unicode, encoding, errors); 3278 if (v == NULL) 3279 goto onError; 3280 if (!PyUnicode_Check(v)) { 3281 PyErr_Format(PyExc_TypeError, 3282 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3283 "use codecs.decode() to decode to arbitrary types", 3284 encoding, 3285 Py_TYPE(unicode)->tp_name); 3286 Py_DECREF(v); 3287 goto onError; 3288 } 3289 return unicode_result(v); 3290 3291 onError: 3292 return NULL; 3293 } 3294 3295 PyObject * 3296 PyUnicode_Encode(const Py_UNICODE *s, 3297 Py_ssize_t size, 3298 const char *encoding, 3299 const char *errors) 3300 { 3301 PyObject *v, *unicode; 3302 3303 unicode = PyUnicode_FromUnicode(s, size); 3304 if (unicode == NULL) 3305 return NULL; 3306 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 3307 Py_DECREF(unicode); 3308 return v; 3309 } 3310 3311 PyObject * 3312 PyUnicode_AsEncodedObject(PyObject *unicode, 3313 const char *encoding, 3314 const char *errors) 3315 { 3316 PyObject *v; 3317 3318 if (!PyUnicode_Check(unicode)) { 3319 PyErr_BadArgument(); 3320 goto onError; 3321 } 3322 3323 if (PyErr_WarnEx(PyExc_DeprecationWarning, 3324 "PyUnicode_AsEncodedObject() is deprecated; " 3325 "use PyUnicode_AsEncodedString() to encode from str to bytes " 3326 "or PyCodec_Encode() for generic encoding", 1) < 0) 3327 return NULL; 3328 3329 if (encoding == NULL) 3330 encoding = PyUnicode_GetDefaultEncoding(); 3331 3332 /* Encode via the codec registry */ 3333 v = PyCodec_Encode(unicode, encoding, errors); 3334 if (v == NULL) 3335 goto onError; 3336 return v; 3337 3338 onError: 3339 return NULL; 3340 } 3341 3342 static size_t 3343 wcstombs_errorpos(const wchar_t *wstr) 3344 { 3345 size_t len; 3346 #if 
SIZEOF_WCHAR_T == 2 3347 wchar_t buf[3]; 3348 #else 3349 wchar_t buf[2]; 3350 #endif 3351 char outbuf[MB_LEN_MAX]; 3352 const wchar_t *start, *previous; 3353 3354 #if SIZEOF_WCHAR_T == 2 3355 buf[2] = 0; 3356 #else 3357 buf[1] = 0; 3358 #endif 3359 start = wstr; 3360 while (*wstr != L'\0') 3361 { 3362 previous = wstr; 3363 #if SIZEOF_WCHAR_T == 2 3364 if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0]) 3365 && Py_UNICODE_IS_LOW_SURROGATE(wstr[1])) 3366 { 3367 buf[0] = wstr[0]; 3368 buf[1] = wstr[1]; 3369 wstr += 2; 3370 } 3371 else { 3372 buf[0] = *wstr; 3373 buf[1] = 0; 3374 wstr++; 3375 } 3376 #else 3377 buf[0] = *wstr; 3378 wstr++; 3379 #endif 3380 len = wcstombs(outbuf, buf, sizeof(outbuf)); 3381 if (len == (size_t)-1) 3382 return previous - start; 3383 } 3384 3385 /* failed to find the unencodable character */ 3386 return 0; 3387 } 3388 3389 static int 3390 locale_error_handler(const char *errors, int *surrogateescape) 3391 { 3392 _Py_error_handler error_handler = get_error_handler(errors); 3393 switch (error_handler) 3394 { 3395 case _Py_ERROR_STRICT: 3396 *surrogateescape = 0; 3397 return 0; 3398 case _Py_ERROR_SURROGATEESCAPE: 3399 *surrogateescape = 1; 3400 return 0; 3401 default: 3402 PyErr_Format(PyExc_ValueError, 3403 "only 'strict' and 'surrogateescape' error handlers " 3404 "are supported, not '%s'", 3405 errors); 3406 return -1; 3407 } 3408 } 3409 3410 PyObject * 3411 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) 3412 { 3413 Py_ssize_t wlen, wlen2; 3414 wchar_t *wstr; 3415 PyObject *bytes = NULL; 3416 char *errmsg; 3417 PyObject *reason = NULL; 3418 PyObject *exc; 3419 size_t error_pos; 3420 int surrogateescape; 3421 3422 if (locale_error_handler(errors, &surrogateescape) < 0) 3423 return NULL; 3424 3425 wstr = PyUnicode_AsWideCharString(unicode, &wlen); 3426 if (wstr == NULL) 3427 return NULL; 3428 3429 wlen2 = wcslen(wstr); 3430 if (wlen2 != wlen) { 3431 PyMem_Free(wstr); 3432 PyErr_SetString(PyExc_ValueError, "embedded null character"); 3433 
return NULL; 3434 } 3435 3436 if (surrogateescape) { 3437 /* "surrogateescape" error handler */ 3438 char *str; 3439 3440 str = Py_EncodeLocale(wstr, &error_pos); 3441 if (str == NULL) { 3442 if (error_pos == (size_t)-1) { 3443 PyErr_NoMemory(); 3444 PyMem_Free(wstr); 3445 return NULL; 3446 } 3447 else { 3448 goto encode_error; 3449 } 3450 } 3451 PyMem_Free(wstr); 3452 3453 bytes = PyBytes_FromString(str); 3454 PyMem_Free(str); 3455 } 3456 else { 3457 /* strict mode */ 3458 size_t len, len2; 3459 3460 len = wcstombs(NULL, wstr, 0); 3461 if (len == (size_t)-1) { 3462 error_pos = (size_t)-1; 3463 goto encode_error; 3464 } 3465 3466 bytes = PyBytes_FromStringAndSize(NULL, len); 3467 if (bytes == NULL) { 3468 PyMem_Free(wstr); 3469 return NULL; 3470 } 3471 3472 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1); 3473 if (len2 == (size_t)-1 || len2 > len) { 3474 error_pos = (size_t)-1; 3475 goto encode_error; 3476 } 3477 PyMem_Free(wstr); 3478 } 3479 return bytes; 3480 3481 encode_error: 3482 errmsg = strerror(errno); 3483 assert(errmsg != NULL); 3484 3485 if (error_pos == (size_t)-1) 3486 error_pos = wcstombs_errorpos(wstr); 3487 3488 PyMem_Free(wstr); 3489 Py_XDECREF(bytes); 3490 3491 if (errmsg != NULL) { 3492 size_t errlen; 3493 wstr = Py_DecodeLocale(errmsg, &errlen); 3494 if (wstr != NULL) { 3495 reason = PyUnicode_FromWideChar(wstr, errlen); 3496 PyMem_RawFree(wstr); 3497 } else 3498 errmsg = NULL; 3499 } 3500 if (errmsg == NULL) 3501 reason = PyUnicode_FromString( 3502 "wcstombs() encountered an unencodable " 3503 "wide character"); 3504 if (reason == NULL) 3505 return NULL; 3506 3507 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO", 3508 "locale", unicode, 3509 (Py_ssize_t)error_pos, 3510 (Py_ssize_t)(error_pos+1), 3511 reason); 3512 Py_DECREF(reason); 3513 if (exc != NULL) { 3514 PyCodec_StrictErrors(exc); 3515 Py_XDECREF(exc); 3516 } 3517 return NULL; 3518 } 3519 3520 PyObject * 3521 PyUnicode_EncodeFSDefault(PyObject *unicode) 3522 { 3523 
#if defined(__APPLE__) 3524 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors); 3525 #else 3526 PyInterpreterState *interp = PyThreadState_GET()->interp; 3527 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3528 cannot use it to encode and decode filenames before it is loaded. Load 3529 the Python codec requires to encode at least its own filename. Use the C 3530 version of the locale codec until the codec registry is initialized and 3531 the Python codec is loaded. 3532 3533 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3534 cannot only rely on it: check also interp->fscodec_initialized for 3535 subinterpreters. */ 3536 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3537 return PyUnicode_AsEncodedString(unicode, 3538 Py_FileSystemDefaultEncoding, 3539 Py_FileSystemDefaultEncodeErrors); 3540 } 3541 else { 3542 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors); 3543 } 3544 #endif 3545 } 3546 3547 PyObject * 3548 PyUnicode_AsEncodedString(PyObject *unicode, 3549 const char *encoding, 3550 const char *errors) 3551 { 3552 PyObject *v; 3553 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */ 3554 3555 if (!PyUnicode_Check(unicode)) { 3556 PyErr_BadArgument(); 3557 return NULL; 3558 } 3559 3560 if (encoding == NULL) { 3561 return _PyUnicode_AsUTF8String(unicode, errors); 3562 } 3563 3564 /* Shortcuts for common default encodings */ 3565 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { 3566 char *lower = buflower; 3567 3568 /* Fast paths */ 3569 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') { 3570 lower += 3; 3571 if (*lower == '_') { 3572 /* Match "utf8" and "utf_8" */ 3573 lower++; 3574 } 3575 3576 if (lower[0] == '8' && lower[1] == 0) { 3577 return _PyUnicode_AsUTF8String(unicode, errors); 3578 } 3579 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) { 3580 return _PyUnicode_EncodeUTF16(unicode, 
errors, 0); 3581 } 3582 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) { 3583 return _PyUnicode_EncodeUTF32(unicode, errors, 0); 3584 } 3585 } 3586 else { 3587 if (strcmp(lower, "ascii") == 0 3588 || strcmp(lower, "us_ascii") == 0) { 3589 return _PyUnicode_AsASCIIString(unicode, errors); 3590 } 3591 #ifdef MS_WINDOWS 3592 else if (strcmp(lower, "mbcs") == 0) { 3593 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3594 } 3595 #endif 3596 else if (strcmp(lower, "latin1") == 0 || 3597 strcmp(lower, "latin_1") == 0 || 3598 strcmp(lower, "iso_8859_1") == 0 || 3599 strcmp(lower, "iso8859_1") == 0) { 3600 return _PyUnicode_AsLatin1String(unicode, errors); 3601 } 3602 } 3603 } 3604 3605 /* Encode via the codec registry */ 3606 v = _PyCodec_EncodeText(unicode, encoding, errors); 3607 if (v == NULL) 3608 return NULL; 3609 3610 /* The normal path */ 3611 if (PyBytes_Check(v)) 3612 return v; 3613 3614 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3615 if (PyByteArray_Check(v)) { 3616 int error; 3617 PyObject *b; 3618 3619 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3620 "encoder %s returned bytearray instead of bytes; " 3621 "use codecs.encode() to encode to arbitrary types", 3622 encoding); 3623 if (error) { 3624 Py_DECREF(v); 3625 return NULL; 3626 } 3627 3628 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3629 Py_DECREF(v); 3630 return b; 3631 } 3632 3633 PyErr_Format(PyExc_TypeError, 3634 "'%.400s' encoder returned '%.400s' instead of 'bytes'; " 3635 "use codecs.encode() to encode to arbitrary types", 3636 encoding, 3637 Py_TYPE(v)->tp_name); 3638 Py_DECREF(v); 3639 return NULL; 3640 } 3641 3642 PyObject * 3643 PyUnicode_AsEncodedUnicode(PyObject *unicode, 3644 const char *encoding, 3645 const char *errors) 3646 { 3647 PyObject *v; 3648 3649 if (!PyUnicode_Check(unicode)) { 3650 PyErr_BadArgument(); 3651 goto onError; 3652 } 3653 3654 if (PyErr_WarnEx(PyExc_DeprecationWarning, 3655 
"PyUnicode_AsEncodedUnicode() is deprecated; " 3656 "use PyCodec_Encode() to encode from str to str", 1) < 0) 3657 return NULL; 3658 3659 if (encoding == NULL) 3660 encoding = PyUnicode_GetDefaultEncoding(); 3661 3662 /* Encode via the codec registry */ 3663 v = PyCodec_Encode(unicode, encoding, errors); 3664 if (v == NULL) 3665 goto onError; 3666 if (!PyUnicode_Check(v)) { 3667 PyErr_Format(PyExc_TypeError, 3668 "'%.400s' encoder returned '%.400s' instead of 'str'; " 3669 "use codecs.encode() to encode to arbitrary types", 3670 encoding, 3671 Py_TYPE(v)->tp_name); 3672 Py_DECREF(v); 3673 goto onError; 3674 } 3675 return v; 3676 3677 onError: 3678 return NULL; 3679 } 3680 3681 static size_t 3682 mbstowcs_errorpos(const char *str, size_t len) 3683 { 3684 #ifdef HAVE_MBRTOWC 3685 const char *start = str; 3686 mbstate_t mbs; 3687 size_t converted; 3688 wchar_t ch; 3689 3690 memset(&mbs, 0, sizeof mbs); 3691 while (len) 3692 { 3693 converted = mbrtowc(&ch, str, len, &mbs); 3694 if (converted == 0) 3695 /* Reached end of string */ 3696 break; 3697 if (converted == (size_t)-1 || converted == (size_t)-2) { 3698 /* Conversion error or incomplete character */ 3699 return str - start; 3700 } 3701 else { 3702 str += converted; 3703 len -= converted; 3704 } 3705 } 3706 /* failed to find the undecodable byte sequence */ 3707 return 0; 3708 #endif 3709 return 0; 3710 } 3711 3712 PyObject* 3713 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, 3714 const char *errors) 3715 { 3716 wchar_t smallbuf[256]; 3717 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); 3718 wchar_t *wstr; 3719 size_t wlen, wlen2; 3720 PyObject *unicode; 3721 int surrogateescape; 3722 size_t error_pos; 3723 char *errmsg; 3724 PyObject *reason = NULL; /* initialize to prevent gcc warning */ 3725 PyObject *exc; 3726 3727 if (locale_error_handler(errors, &surrogateescape) < 0) 3728 return NULL; 3729 3730 if (str[len] != '\0' || (size_t)len != strlen(str)) { 3731 PyErr_SetString(PyExc_ValueError, 
"embedded null byte"); 3732 return NULL; 3733 } 3734 3735 if (surrogateescape) { 3736 /* "surrogateescape" error handler */ 3737 wstr = Py_DecodeLocale(str, &wlen); 3738 if (wstr == NULL) { 3739 if (wlen == (size_t)-1) 3740 PyErr_NoMemory(); 3741 else 3742 PyErr_SetFromErrno(PyExc_OSError); 3743 return NULL; 3744 } 3745 3746 unicode = PyUnicode_FromWideChar(wstr, wlen); 3747 PyMem_RawFree(wstr); 3748 } 3749 else { 3750 /* strict mode */ 3751 #ifndef HAVE_BROKEN_MBSTOWCS 3752 wlen = mbstowcs(NULL, str, 0); 3753 #else 3754 wlen = len; 3755 #endif 3756 if (wlen == (size_t)-1) 3757 goto decode_error; 3758 if (wlen+1 <= smallbuf_len) { 3759 wstr = smallbuf; 3760 } 3761 else { 3762 wstr = PyMem_New(wchar_t, wlen+1); 3763 if (!wstr) 3764 return PyErr_NoMemory(); 3765 } 3766 3767 wlen2 = mbstowcs(wstr, str, wlen+1); 3768 if (wlen2 == (size_t)-1) { 3769 if (wstr != smallbuf) 3770 PyMem_Free(wstr); 3771 goto decode_error; 3772 } 3773 #ifdef HAVE_BROKEN_MBSTOWCS 3774 assert(wlen2 == wlen); 3775 #endif 3776 unicode = PyUnicode_FromWideChar(wstr, wlen2); 3777 if (wstr != smallbuf) 3778 PyMem_Free(wstr); 3779 } 3780 return unicode; 3781 3782 decode_error: 3783 reason = NULL; 3784 errmsg = strerror(errno); 3785 assert(errmsg != NULL); 3786 3787 error_pos = mbstowcs_errorpos(str, len); 3788 if (errmsg != NULL) { 3789 size_t errlen; 3790 wstr = Py_DecodeLocale(errmsg, &errlen); 3791 if (wstr != NULL) { 3792 reason = PyUnicode_FromWideChar(wstr, errlen); 3793 PyMem_RawFree(wstr); 3794 } 3795 } 3796 if (reason == NULL) 3797 reason = PyUnicode_FromString( 3798 "mbstowcs() encountered an invalid multibyte sequence"); 3799 if (reason == NULL) 3800 return NULL; 3801 3802 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", 3803 "locale", str, len, 3804 (Py_ssize_t)error_pos, 3805 (Py_ssize_t)(error_pos+1), 3806 reason); 3807 Py_DECREF(reason); 3808 if (exc != NULL) { 3809 PyCodec_StrictErrors(exc); 3810 Py_XDECREF(exc); 3811 } 3812 return NULL; 3813 } 3814 3815 PyObject* 3816 
PyUnicode_DecodeLocale(const char *str, const char *errors) 3817 { 3818 Py_ssize_t size = (Py_ssize_t)strlen(str); 3819 return PyUnicode_DecodeLocaleAndSize(str, size, errors); 3820 } 3821 3822 3823 PyObject* 3824 PyUnicode_DecodeFSDefault(const char *s) { 3825 Py_ssize_t size = (Py_ssize_t)strlen(s); 3826 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3827 } 3828 3829 PyObject* 3830 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3831 { 3832 #if defined(__APPLE__) 3833 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL); 3834 #else 3835 PyInterpreterState *interp = PyThreadState_GET()->interp; 3836 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3837 cannot use it to encode and decode filenames before it is loaded. Load 3838 the Python codec requires to encode at least its own filename. Use the C 3839 version of the locale codec until the codec registry is initialized and 3840 the Python codec is loaded. 3841 3842 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3843 cannot only rely on it: check also interp->fscodec_initialized for 3844 subinterpreters. */ 3845 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3846 return PyUnicode_Decode(s, size, 3847 Py_FileSystemDefaultEncoding, 3848 Py_FileSystemDefaultEncodeErrors); 3849 } 3850 else { 3851 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors); 3852 } 3853 #endif 3854 } 3855 3856 3857 int 3858 PyUnicode_FSConverter(PyObject* arg, void* addr) 3859 { 3860 PyObject *path = NULL; 3861 PyObject *output = NULL; 3862 Py_ssize_t size; 3863 void *data; 3864 if (arg == NULL) { 3865 Py_DECREF(*(PyObject**)addr); 3866 *(PyObject**)addr = NULL; 3867 return 1; 3868 } 3869 path = PyOS_FSPath(arg); 3870 if (path == NULL) { 3871 return 0; 3872 } 3873 if (PyBytes_Check(path)) { 3874 output = path; 3875 } 3876 else { // PyOS_FSPath() guarantees its returned value is bytes or str. 
3877 output = PyUnicode_EncodeFSDefault(path); 3878 Py_DECREF(path); 3879 if (!output) { 3880 return 0; 3881 } 3882 assert(PyBytes_Check(output)); 3883 } 3884 3885 size = PyBytes_GET_SIZE(output); 3886 data = PyBytes_AS_STRING(output); 3887 if ((size_t)size != strlen(data)) { 3888 PyErr_SetString(PyExc_ValueError, "embedded null byte"); 3889 Py_DECREF(output); 3890 return 0; 3891 } 3892 *(PyObject**)addr = output; 3893 return Py_CLEANUP_SUPPORTED; 3894 } 3895 3896 3897 int 3898 PyUnicode_FSDecoder(PyObject* arg, void* addr) 3899 { 3900 int is_buffer = 0; 3901 PyObject *path = NULL; 3902 PyObject *output = NULL; 3903 if (arg == NULL) { 3904 Py_DECREF(*(PyObject**)addr); 3905 return 1; 3906 } 3907 3908 is_buffer = PyObject_CheckBuffer(arg); 3909 if (!is_buffer) { 3910 path = PyOS_FSPath(arg); 3911 if (path == NULL) { 3912 return 0; 3913 } 3914 } 3915 else { 3916 path = arg; 3917 Py_INCREF(arg); 3918 } 3919 3920 if (PyUnicode_Check(path)) { 3921 if (PyUnicode_READY(path) == -1) { 3922 Py_DECREF(path); 3923 return 0; 3924 } 3925 output = path; 3926 } 3927 else if (PyBytes_Check(path) || is_buffer) { 3928 PyObject *path_bytes = NULL; 3929 3930 if (!PyBytes_Check(path) && 3931 PyErr_WarnFormat(PyExc_DeprecationWarning, 1, 3932 "path should be string, bytes, or os.PathLike, not %.200s", 3933 Py_TYPE(arg)->tp_name)) { 3934 Py_DECREF(path); 3935 return 0; 3936 } 3937 path_bytes = PyBytes_FromObject(path); 3938 Py_DECREF(path); 3939 if (!path_bytes) { 3940 return 0; 3941 } 3942 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes), 3943 PyBytes_GET_SIZE(path_bytes)); 3944 Py_DECREF(path_bytes); 3945 if (!output) { 3946 return 0; 3947 } 3948 } 3949 else { 3950 PyErr_Format(PyExc_TypeError, 3951 "path should be string, bytes, or os.PathLike, not %.200s", 3952 Py_TYPE(arg)->tp_name); 3953 Py_DECREF(path); 3954 return 0; 3955 } 3956 if (PyUnicode_READY(output) == -1) { 3957 Py_DECREF(output); 3958 return 0; 3959 } 3960 if (findchar(PyUnicode_DATA(output), 
PyUnicode_KIND(output), 3961 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3962 PyErr_SetString(PyExc_ValueError, "embedded null character"); 3963 Py_DECREF(output); 3964 return 0; 3965 } 3966 *(PyObject**)addr = output; 3967 return Py_CLEANUP_SUPPORTED; 3968 } 3969 3970 3971 char* 3972 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3973 { 3974 PyObject *bytes; 3975 3976 if (!PyUnicode_Check(unicode)) { 3977 PyErr_BadArgument(); 3978 return NULL; 3979 } 3980 if (PyUnicode_READY(unicode) == -1) 3981 return NULL; 3982 3983 if (PyUnicode_UTF8(unicode) == NULL) { 3984 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3985 bytes = _PyUnicode_AsUTF8String(unicode, NULL); 3986 if (bytes == NULL) 3987 return NULL; 3988 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3989 if (_PyUnicode_UTF8(unicode) == NULL) { 3990 PyErr_NoMemory(); 3991 Py_DECREF(bytes); 3992 return NULL; 3993 } 3994 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3995 memcpy(_PyUnicode_UTF8(unicode), 3996 PyBytes_AS_STRING(bytes), 3997 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3998 Py_DECREF(bytes); 3999 } 4000 4001 if (psize) 4002 *psize = PyUnicode_UTF8_LENGTH(unicode); 4003 return PyUnicode_UTF8(unicode); 4004 } 4005 4006 char* 4007 PyUnicode_AsUTF8(PyObject *unicode) 4008 { 4009 return PyUnicode_AsUTF8AndSize(unicode, NULL); 4010 } 4011 4012 Py_UNICODE * 4013 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 4014 { 4015 const unsigned char *one_byte; 4016 #if SIZEOF_WCHAR_T == 4 4017 const Py_UCS2 *two_bytes; 4018 #else 4019 const Py_UCS4 *four_bytes; 4020 const Py_UCS4 *ucs4_end; 4021 Py_ssize_t num_surrogates; 4022 #endif 4023 wchar_t *w; 4024 wchar_t *wchar_end; 4025 4026 if (!PyUnicode_Check(unicode)) { 4027 PyErr_BadArgument(); 4028 return NULL; 4029 } 4030 if (_PyUnicode_WSTR(unicode) == NULL) { 4031 /* Non-ASCII compact unicode object */ 4032 assert(_PyUnicode_KIND(unicode) != 0); 4033 assert(PyUnicode_IS_READY(unicode)); 4034 4035 if 
(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 4036 #if SIZEOF_WCHAR_T == 2 4037 four_bytes = PyUnicode_4BYTE_DATA(unicode); 4038 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 4039 num_surrogates = 0; 4040 4041 for (; four_bytes < ucs4_end; ++four_bytes) { 4042 if (*four_bytes > 0xFFFF) 4043 ++num_surrogates; 4044 } 4045 4046 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( 4047 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); 4048 if (!_PyUnicode_WSTR(unicode)) { 4049 PyErr_NoMemory(); 4050 return NULL; 4051 } 4052 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; 4053 4054 w = _PyUnicode_WSTR(unicode); 4055 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); 4056 four_bytes = PyUnicode_4BYTE_DATA(unicode); 4057 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 4058 if (*four_bytes > 0xFFFF) { 4059 assert(*four_bytes <= MAX_UNICODE); 4060 /* encode surrogate pair in this case */ 4061 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); 4062 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); 4063 } 4064 else 4065 *w = *four_bytes; 4066 4067 if (w > wchar_end) { 4068 assert(0 && "Miscalculated string end"); 4069 } 4070 } 4071 *w = 0; 4072 #else 4073 /* sizeof(wchar_t) == 4 */ 4074 Py_FatalError("Impossible unicode object state, wstr and str " 4075 "should share memory already."); 4076 return NULL; 4077 #endif 4078 } 4079 else { 4080 if ((size_t)_PyUnicode_LENGTH(unicode) > 4081 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 4082 PyErr_NoMemory(); 4083 return NULL; 4084 } 4085 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 4086 (_PyUnicode_LENGTH(unicode) + 1)); 4087 if (!_PyUnicode_WSTR(unicode)) { 4088 PyErr_NoMemory(); 4089 return NULL; 4090 } 4091 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 4092 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); 4093 w = _PyUnicode_WSTR(unicode); 4094 wchar_end = w + _PyUnicode_LENGTH(unicode); 4095 4096 if (PyUnicode_KIND(unicode) == 
PyUnicode_1BYTE_KIND) { 4097 one_byte = PyUnicode_1BYTE_DATA(unicode); 4098 for (; w < wchar_end; ++one_byte, ++w) 4099 *w = *one_byte; 4100 /* null-terminate the wstr */ 4101 *w = 0; 4102 } 4103 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 4104 #if SIZEOF_WCHAR_T == 4 4105 two_bytes = PyUnicode_2BYTE_DATA(unicode); 4106 for (; w < wchar_end; ++two_bytes, ++w) 4107 *w = *two_bytes; 4108 /* null-terminate the wstr */ 4109 *w = 0; 4110 #else 4111 /* sizeof(wchar_t) == 2 */ 4112 PyObject_FREE(_PyUnicode_WSTR(unicode)); 4113 _PyUnicode_WSTR(unicode) = NULL; 4114 Py_FatalError("Impossible unicode object state, wstr " 4115 "and str should share memory already."); 4116 return NULL; 4117 #endif 4118 } 4119 else { 4120 assert(0 && "This should never happen."); 4121 } 4122 } 4123 } 4124 if (size != NULL) 4125 *size = PyUnicode_WSTR_LENGTH(unicode); 4126 return _PyUnicode_WSTR(unicode); 4127 } 4128 4129 Py_UNICODE * 4130 PyUnicode_AsUnicode(PyObject *unicode) 4131 { 4132 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 4133 } 4134 4135 4136 Py_ssize_t 4137 PyUnicode_GetSize(PyObject *unicode) 4138 { 4139 if (!PyUnicode_Check(unicode)) { 4140 PyErr_BadArgument(); 4141 goto onError; 4142 } 4143 return PyUnicode_GET_SIZE(unicode); 4144 4145 onError: 4146 return -1; 4147 } 4148 4149 Py_ssize_t 4150 PyUnicode_GetLength(PyObject *unicode) 4151 { 4152 if (!PyUnicode_Check(unicode)) { 4153 PyErr_BadArgument(); 4154 return -1; 4155 } 4156 if (PyUnicode_READY(unicode) == -1) 4157 return -1; 4158 return PyUnicode_GET_LENGTH(unicode); 4159 } 4160 4161 Py_UCS4 4162 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 4163 { 4164 void *data; 4165 int kind; 4166 4167 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 4168 PyErr_BadArgument(); 4169 return (Py_UCS4)-1; 4170 } 4171 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 4172 PyErr_SetString(PyExc_IndexError, "string index out of range"); 4173 return (Py_UCS4)-1; 4174 } 4175 data = 
PyUnicode_DATA(unicode); 4176 kind = PyUnicode_KIND(unicode); 4177 return PyUnicode_READ(kind, data, index); 4178 } 4179 4180 int 4181 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 4182 { 4183 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 4184 PyErr_BadArgument(); 4185 return -1; 4186 } 4187 assert(PyUnicode_IS_READY(unicode)); 4188 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 4189 PyErr_SetString(PyExc_IndexError, "string index out of range"); 4190 return -1; 4191 } 4192 if (unicode_check_modifiable(unicode)) 4193 return -1; 4194 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 4195 PyErr_SetString(PyExc_ValueError, "character out of range"); 4196 return -1; 4197 } 4198 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 4199 index, ch); 4200 return 0; 4201 } 4202 4203 const char * 4204 PyUnicode_GetDefaultEncoding(void) 4205 { 4206 return "utf-8"; 4207 } 4208 4209 /* create or adjust a UnicodeDecodeError */ 4210 static void 4211 make_decode_exception(PyObject **exceptionObject, 4212 const char *encoding, 4213 const char *input, Py_ssize_t length, 4214 Py_ssize_t startpos, Py_ssize_t endpos, 4215 const char *reason) 4216 { 4217 if (*exceptionObject == NULL) { 4218 *exceptionObject = PyUnicodeDecodeError_Create( 4219 encoding, input, length, startpos, endpos, reason); 4220 } 4221 else { 4222 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 4223 goto onError; 4224 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 4225 goto onError; 4226 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 4227 goto onError; 4228 } 4229 return; 4230 4231 onError: 4232 Py_CLEAR(*exceptionObject); 4233 } 4234 4235 #ifdef MS_WINDOWS 4236 /* error handling callback helper: 4237 build arguments, call the callback and check the arguments, 4238 if no exception occurred, copy the replacement to the output 4239 and adjust various state variables. 
4240 return 0 on success, -1 on error 4241 */ 4242 4243 static int 4244 unicode_decode_call_errorhandler_wchar( 4245 const char *errors, PyObject **errorHandler, 4246 const char *encoding, const char *reason, 4247 const char **input, const char **inend, Py_ssize_t *startinpos, 4248 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4249 PyObject **output, Py_ssize_t *outpos) 4250 { 4251 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4252 4253 PyObject *restuple = NULL; 4254 PyObject *repunicode = NULL; 4255 Py_ssize_t outsize; 4256 Py_ssize_t insize; 4257 Py_ssize_t requiredsize; 4258 Py_ssize_t newpos; 4259 PyObject *inputobj = NULL; 4260 wchar_t *repwstr; 4261 Py_ssize_t repwlen; 4262 4263 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND); 4264 outsize = _PyUnicode_WSTR_LENGTH(*output); 4265 4266 if (*errorHandler == NULL) { 4267 *errorHandler = PyCodec_LookupError(errors); 4268 if (*errorHandler == NULL) 4269 goto onError; 4270 } 4271 4272 make_decode_exception(exceptionObject, 4273 encoding, 4274 *input, *inend - *input, 4275 *startinpos, *endinpos, 4276 reason); 4277 if (*exceptionObject == NULL) 4278 goto onError; 4279 4280 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4281 if (restuple == NULL) 4282 goto onError; 4283 if (!PyTuple_Check(restuple)) { 4284 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4285 goto onError; 4286 } 4287 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4288 goto onError; 4289 4290 /* Copy back the bytes variables, which might have been modified by the 4291 callback */ 4292 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4293 if (!inputobj) 4294 goto onError; 4295 if (!PyBytes_Check(inputobj)) { 4296 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4297 } 4298 *input = PyBytes_AS_STRING(inputobj); 4299 insize = PyBytes_GET_SIZE(inputobj); 4300 *inend = *input 
+ insize; 4301 /* we can DECREF safely, as the exception has another reference, 4302 so the object won't go away. */ 4303 Py_DECREF(inputobj); 4304 4305 if (newpos<0) 4306 newpos = insize+newpos; 4307 if (newpos<0 || newpos>insize) { 4308 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4309 goto onError; 4310 } 4311 4312 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); 4313 if (repwstr == NULL) 4314 goto onError; 4315 /* need more space? (at least enough for what we 4316 have+the replacement+the rest of the string (starting 4317 at the new input position), so we won't have to check space 4318 when there are no errors in the rest of the string) */ 4319 requiredsize = *outpos; 4320 if (requiredsize > PY_SSIZE_T_MAX - repwlen) 4321 goto overflow; 4322 requiredsize += repwlen; 4323 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos)) 4324 goto overflow; 4325 requiredsize += insize - newpos; 4326 if (requiredsize > outsize) { 4327 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize) 4328 requiredsize = 2*outsize; 4329 if (unicode_resize(output, requiredsize) < 0) 4330 goto onError; 4331 } 4332 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen); 4333 *outpos += repwlen; 4334 *endinpos = newpos; 4335 *inptr = *input + newpos; 4336 4337 /* we made it! 
*/ 4338 Py_XDECREF(restuple); 4339 return 0; 4340 4341 overflow: 4342 PyErr_SetString(PyExc_OverflowError, 4343 "decoded result is too long for a Python string"); 4344 4345 onError: 4346 Py_XDECREF(restuple); 4347 return -1; 4348 } 4349 #endif /* MS_WINDOWS */ 4350 4351 static int 4352 unicode_decode_call_errorhandler_writer( 4353 const char *errors, PyObject **errorHandler, 4354 const char *encoding, const char *reason, 4355 const char **input, const char **inend, Py_ssize_t *startinpos, 4356 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4357 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */) 4358 { 4359 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4360 4361 PyObject *restuple = NULL; 4362 PyObject *repunicode = NULL; 4363 Py_ssize_t insize; 4364 Py_ssize_t newpos; 4365 Py_ssize_t replen; 4366 PyObject *inputobj = NULL; 4367 4368 if (*errorHandler == NULL) { 4369 *errorHandler = PyCodec_LookupError(errors); 4370 if (*errorHandler == NULL) 4371 goto onError; 4372 } 4373 4374 make_decode_exception(exceptionObject, 4375 encoding, 4376 *input, *inend - *input, 4377 *startinpos, *endinpos, 4378 reason); 4379 if (*exceptionObject == NULL) 4380 goto onError; 4381 4382 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4383 if (restuple == NULL) 4384 goto onError; 4385 if (!PyTuple_Check(restuple)) { 4386 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4387 goto onError; 4388 } 4389 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4390 goto onError; 4391 4392 /* Copy back the bytes variables, which might have been modified by the 4393 callback */ 4394 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4395 if (!inputobj) 4396 goto onError; 4397 if (!PyBytes_Check(inputobj)) { 4398 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4399 } 4400 *input = PyBytes_AS_STRING(inputobj); 4401 insize 
= PyBytes_GET_SIZE(inputobj); 4402 *inend = *input + insize; 4403 /* we can DECREF safely, as the exception has another reference, 4404 so the object won't go away. */ 4405 Py_DECREF(inputobj); 4406 4407 if (newpos<0) 4408 newpos = insize+newpos; 4409 if (newpos<0 || newpos>insize) { 4410 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4411 goto onError; 4412 } 4413 4414 if (PyUnicode_READY(repunicode) < 0) 4415 goto onError; 4416 replen = PyUnicode_GET_LENGTH(repunicode); 4417 if (replen > 1) { 4418 writer->min_length += replen - 1; 4419 writer->overallocate = 1; 4420 if (_PyUnicodeWriter_Prepare(writer, writer->min_length, 4421 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1) 4422 goto onError; 4423 } 4424 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1) 4425 goto onError; 4426 4427 *endinpos = newpos; 4428 *inptr = *input + newpos; 4429 4430 /* we made it! */ 4431 Py_XDECREF(restuple); 4432 return 0; 4433 4434 onError: 4435 Py_XDECREF(restuple); 4436 return -1; 4437 } 4438 4439 /* --- UTF-7 Codec -------------------------------------------------------- */ 4440 4441 /* See RFC2152 for details. We encode conservatively and decode liberally. */ 4442 4443 /* Three simple macros defining base-64. */ 4444 4445 /* Is c a base-64 character? */ 4446 4447 #define IS_BASE64(c) \ 4448 (((c) >= 'A' && (c) <= 'Z') || \ 4449 ((c) >= 'a' && (c) <= 'z') || \ 4450 ((c) >= '0' && (c) <= '9') || \ 4451 (c) == '+' || (c) == '/') 4452 4453 /* given that c is a base-64 character, what is its base-64 value? */ 4454 4455 #define FROM_BASE64(c) \ 4456 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 4457 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 4458 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 4459 (c) == '+' ? 62 : 63) 4460 4461 /* What is the base-64 character of the bottom 6 bits of n? 
/* TO_BASE64: the base-64 alphabet character for the bottom 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code 0..127,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - code point to test; only values 1..127 can qualify.
 * directO  - non-zero if Set O characters (category 1) are written as is.
 * directWS - non-zero if whitespace (category 2) is written as is.
 * Set D characters (category 0) always qualify. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))

/* Decode `size` bytes of UTF-7 starting at `s`.  Equivalent to
   PyUnicode_DecodeUTF7Stateful() with consumed == NULL, i.e. the input is
   treated as complete. */
PyObject *
PyUnicode_DecodeUTF7(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}

/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size  - input bytes.
 * errors   - error-handler name, forwarded to
 *            unicode_decode_call_errorhandler_writer().
 * consumed - if NULL, the input is final and a shift sequence left in an
 *            inconsistent state at the end is reported as an error; if
 *            non-NULL, it receives the number of input bytes consumed,
 *            which is the offset of the opening '+' when the input ends
 *            inside a shift sequence.
 *
 * Returns the decoded str object, or NULL with an exception set. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;          /* offset of the current '+' or bad byte */
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                /* non-zero while inside a base-64 section */
    Py_ssize_t shiftOutStart;       /* writer.pos when the section began */
    unsigned int base64bits = 0;    /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;
    Py_UCS4 surrogate = 0;          /* pending high surrogate, or 0 */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Also entered by the `goto restart` after the end-of-input error
           handler below has moved `s` back inside the input. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* Unpaired high surrogate: emit it on its own,
                               then handle outCh normally below. */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is only written out when the
                   terminating byte decodes as itself; otherwise it is
                   dropped by the reset below. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* Reached only through goto: report [startinpos, s) to the error
           handler, which may replace the input buffer and move `s`. */
utf7Error:
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* The handler may have set a resume position before the end of
               the input; if so, continue decoding from there. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            /* Characters were written inside the unfinished shift sequence
               and the writer holds non-ASCII data: build the result directly
               from the first shiftOutStart characters instead of finishing
               the writer.
               NOTE(review): presumably this keeps characters from the
               discarded tail from widening the result's storage -- confirm. */
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
(ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4791 *out++ = (char) ch; 4792 } 4793 else { 4794 *out++ = '+'; 4795 inShift = 1; 4796 goto encode_char; 4797 } 4798 } 4799 continue; 4800 encode_char: 4801 if (ch >= 0x10000) { 4802 assert(ch <= MAX_UNICODE); 4803 4804 /* code first surrogate */ 4805 base64bits += 16; 4806 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch); 4807 while (base64bits >= 6) { 4808 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4809 base64bits -= 6; 4810 } 4811 /* prepare second surrogate */ 4812 ch = Py_UNICODE_LOW_SURROGATE(ch); 4813 } 4814 base64bits += 16; 4815 base64buffer = (base64buffer << 16) | ch; 4816 while (base64bits >= 6) { 4817 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4818 base64bits -= 6; 4819 } 4820 } 4821 if (base64bits) 4822 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 4823 if (inShift) 4824 *out++ = '-'; 4825 if (_PyBytes_Resize(&v, out - start) < 0) 4826 return NULL; 4827 return v; 4828 } 4829 PyObject * 4830 PyUnicode_EncodeUTF7(const Py_UNICODE *s, 4831 Py_ssize_t size, 4832 int base64SetO, 4833 int base64WhiteSpace, 4834 const char *errors) 4835 { 4836 PyObject *result; 4837 PyObject *tmp = PyUnicode_FromUnicode(s, size); 4838 if (tmp == NULL) 4839 return NULL; 4840 result = _PyUnicode_EncodeUTF7(tmp, base64SetO, 4841 base64WhiteSpace, errors); 4842 Py_DECREF(tmp); 4843 return result; 4844 } 4845 4846 #undef IS_BASE64 4847 #undef FROM_BASE64 4848 #undef TO_BASE64 4849 #undef DECODE_DIRECT 4850 #undef ENCODE_DIRECT 4851 4852 /* --- UTF-8 Codec -------------------------------------------------------- */ 4853 4854 PyObject * 4855 PyUnicode_DecodeUTF8(const char *s, 4856 Py_ssize_t size, 4857 const char *errors) 4858 { 4859 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 4860 } 4861 4862 #include "stringlib/asciilib.h" 4863 #include "stringlib/codecs.h" 4864 #include "stringlib/undef.h" 4865 4866 #include "stringlib/ucs1lib.h" 4867 #include "stringlib/codecs.h" 
4868 #include "stringlib/undef.h" 4869 4870 #include "stringlib/ucs2lib.h" 4871 #include "stringlib/codecs.h" 4872 #include "stringlib/undef.h" 4873 4874 #include "stringlib/ucs4lib.h" 4875 #include "stringlib/codecs.h" 4876 #include "stringlib/undef.h" 4877 4878 /* Mask to quickly check whether a C 'long' contains a 4879 non-ASCII, UTF8-encoded char. */ 4880 #if (SIZEOF_LONG == 8) 4881 # define ASCII_CHAR_MASK 0x8080808080808080UL 4882 #elif (SIZEOF_LONG == 4) 4883 # define ASCII_CHAR_MASK 0x80808080UL 4884 #else 4885 # error C 'long' size should be either 4 or 8! 4886 #endif 4887 4888 static Py_ssize_t 4889 ascii_decode(const char *start, const char *end, Py_UCS1 *dest) 4890 { 4891 const char *p = start; 4892 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); 4893 4894 /* 4895 * Issue #17237: m68k is a bit different from most architectures in 4896 * that objects do not use "natural alignment" - for example, int and 4897 * long are only aligned at 2-byte boundaries. Therefore the assert() 4898 * won't work; also, tests have shown that skipping the "optimised 4899 * version" will even speed up m68k. 4900 */ 4901 #if !defined(__m68k__) 4902 #if SIZEOF_LONG <= SIZEOF_VOID_P 4903 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG)); 4904 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4905 /* Fast path, see in STRINGLIB(utf8_decode) for 4906 an explanation. */ 4907 /* Help allocation */ 4908 const char *_p = p; 4909 Py_UCS1 * q = dest; 4910 while (_p < aligned_end) { 4911 unsigned long value = *(const unsigned long *) _p; 4912 if (value & ASCII_CHAR_MASK) 4913 break; 4914 *((unsigned long *)q) = value; 4915 _p += SIZEOF_LONG; 4916 q += SIZEOF_LONG; 4917 } 4918 p = _p; 4919 while (p < end) { 4920 if ((unsigned char)*p & 0x80) 4921 break; 4922 *q++ = *p++; 4923 } 4924 return p - start; 4925 } 4926 #endif 4927 #endif 4928 while (p < end) { 4929 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h 4930 for an explanation. 
*/ 4931 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4932 /* Help allocation */ 4933 const char *_p = p; 4934 while (_p < aligned_end) { 4935 unsigned long value = *(unsigned long *) _p; 4936 if (value & ASCII_CHAR_MASK) 4937 break; 4938 _p += SIZEOF_LONG; 4939 } 4940 p = _p; 4941 if (_p == end) 4942 break; 4943 } 4944 if ((unsigned char)*p & 0x80) 4945 break; 4946 ++p; 4947 } 4948 memcpy(dest, start, p - start); 4949 return p - start; 4950 } 4951 4952 PyObject * 4953 PyUnicode_DecodeUTF8Stateful(const char *s, 4954 Py_ssize_t size, 4955 const char *errors, 4956 Py_ssize_t *consumed) 4957 { 4958 _PyUnicodeWriter writer; 4959 const char *starts = s; 4960 const char *end = s + size; 4961 4962 Py_ssize_t startinpos; 4963 Py_ssize_t endinpos; 4964 const char *errmsg = ""; 4965 PyObject *error_handler_obj = NULL; 4966 PyObject *exc = NULL; 4967 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 4968 4969 if (size == 0) { 4970 if (consumed) 4971 *consumed = 0; 4972 _Py_RETURN_UNICODE_EMPTY(); 4973 } 4974 4975 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 4976 if (size == 1 && (unsigned char)s[0] < 128) { 4977 if (consumed) 4978 *consumed = 1; 4979 return get_latin1_char((unsigned char)s[0]); 4980 } 4981 4982 _PyUnicodeWriter_Init(&writer); 4983 writer.min_length = size; 4984 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 4985 goto onError; 4986 4987 writer.pos = ascii_decode(s, end, writer.data); 4988 s += writer.pos; 4989 while (s < end) { 4990 Py_UCS4 ch; 4991 int kind = writer.kind; 4992 4993 if (kind == PyUnicode_1BYTE_KIND) { 4994 if (PyUnicode_IS_ASCII(writer.buffer)) 4995 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos); 4996 else 4997 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos); 4998 } else if (kind == PyUnicode_2BYTE_KIND) { 4999 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos); 5000 } else { 5001 assert(kind == PyUnicode_4BYTE_KIND); 5002 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos); 5003 } 5004 5005 switch (ch) { 5006 case 0: 5007 if (s == end || consumed) 5008 goto End; 5009 errmsg = "unexpected end of data"; 5010 startinpos = s - starts; 5011 endinpos = end - starts; 5012 break; 5013 case 1: 5014 errmsg = "invalid start byte"; 5015 startinpos = s - starts; 5016 endinpos = startinpos + 1; 5017 break; 5018 case 2: 5019 case 3: 5020 case 4: 5021 errmsg = "invalid continuation byte"; 5022 startinpos = s - starts; 5023 endinpos = startinpos + ch - 1; 5024 break; 5025 default: 5026 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5027 goto onError; 5028 continue; 5029 } 5030 5031 if (error_handler == _Py_ERROR_UNKNOWN) 5032 error_handler = get_error_handler(errors); 5033 5034 switch (error_handler) { 5035 case _Py_ERROR_IGNORE: 5036 s += (endinpos - startinpos); 5037 break; 5038 5039 case _Py_ERROR_REPLACE: 5040 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0) 5041 goto onError; 5042 s += (endinpos - startinpos); 5043 break; 5044 5045 case _Py_ERROR_SURROGATEESCAPE: 5046 { 5047 Py_ssize_t i; 5048 5049 if 
(_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) 5050 goto onError; 5051 for (i=startinpos; i<endinpos; i++) { 5052 ch = (Py_UCS4)(unsigned char)(starts[i]); 5053 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, 5054 ch + 0xdc00); 5055 writer.pos++; 5056 } 5057 s += (endinpos - startinpos); 5058 break; 5059 } 5060 5061 default: 5062 if (unicode_decode_call_errorhandler_writer( 5063 errors, &error_handler_obj, 5064 "utf-8", errmsg, 5065 &starts, &end, &startinpos, &endinpos, &exc, &s, 5066 &writer)) 5067 goto onError; 5068 } 5069 } 5070 5071 End: 5072 if (consumed) 5073 *consumed = s - starts; 5074 5075 Py_XDECREF(error_handler_obj); 5076 Py_XDECREF(exc); 5077 return _PyUnicodeWriter_Finish(&writer); 5078 5079 onError: 5080 Py_XDECREF(error_handler_obj); 5081 Py_XDECREF(exc); 5082 _PyUnicodeWriter_Dealloc(&writer); 5083 return NULL; 5084 } 5085 5086 #if defined(__APPLE__) || defined(__ANDROID__) 5087 5088 /* Simplified UTF-8 decoder using surrogateescape error handler, 5089 used to decode the command line arguments on Mac OS X and Android. 5090 5091 Return a pointer to a newly allocated wide character string (use 5092 PyMem_RawFree() to free the memory), or NULL on memory allocation error. 
*/ 5093 5094 wchar_t* 5095 _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 5096 { 5097 const char *e; 5098 wchar_t *unicode; 5099 Py_ssize_t outpos; 5100 5101 /* Note: size will always be longer than the resulting Unicode 5102 character count */ 5103 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) 5104 return NULL; 5105 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t)); 5106 if (!unicode) 5107 return NULL; 5108 5109 /* Unpack UTF-8 encoded data */ 5110 e = s + size; 5111 outpos = 0; 5112 while (s < e) { 5113 Py_UCS4 ch; 5114 #if SIZEOF_WCHAR_T == 4 5115 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos); 5116 #else 5117 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos); 5118 #endif 5119 if (ch > 0xFF) { 5120 #if SIZEOF_WCHAR_T == 4 5121 assert(0); 5122 #else 5123 assert(ch > 0xFFFF && ch <= MAX_UNICODE); 5124 /* compute and append the two surrogates: */ 5125 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch); 5126 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch); 5127 #endif 5128 } 5129 else { 5130 if (!ch && s == e) 5131 break; 5132 /* surrogateescape */ 5133 unicode[outpos++] = 0xDC00 + (unsigned char)*s++; 5134 } 5135 } 5136 unicode[outpos] = L'\0'; 5137 return unicode; 5138 } 5139 5140 #endif /* __APPLE__ or __ANDROID__ */ 5141 5142 /* Primary internal function which creates utf8 encoded bytes objects. 5143 5144 Allocation strategy: if the string is short, convert into a stack buffer 5145 and allocate exactly as much space needed at the end. Else allocate the 5146 maximum possible needed (4 result bytes per Unicode character), and return 5147 the excess memory at the end. 
5148 */ 5149 PyObject * 5150 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 5151 { 5152 enum PyUnicode_Kind kind; 5153 void *data; 5154 Py_ssize_t size; 5155 5156 if (!PyUnicode_Check(unicode)) { 5157 PyErr_BadArgument(); 5158 return NULL; 5159 } 5160 5161 if (PyUnicode_READY(unicode) == -1) 5162 return NULL; 5163 5164 if (PyUnicode_UTF8(unicode)) 5165 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 5166 PyUnicode_UTF8_LENGTH(unicode)); 5167 5168 kind = PyUnicode_KIND(unicode); 5169 data = PyUnicode_DATA(unicode); 5170 size = PyUnicode_GET_LENGTH(unicode); 5171 5172 switch (kind) { 5173 default: 5174 assert(0); 5175 case PyUnicode_1BYTE_KIND: 5176 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ 5177 assert(!PyUnicode_IS_ASCII(unicode)); 5178 return ucs1lib_utf8_encoder(unicode, data, size, errors); 5179 case PyUnicode_2BYTE_KIND: 5180 return ucs2lib_utf8_encoder(unicode, data, size, errors); 5181 case PyUnicode_4BYTE_KIND: 5182 return ucs4lib_utf8_encoder(unicode, data, size, errors); 5183 } 5184 } 5185 5186 PyObject * 5187 PyUnicode_EncodeUTF8(const Py_UNICODE *s, 5188 Py_ssize_t size, 5189 const char *errors) 5190 { 5191 PyObject *v, *unicode; 5192 5193 unicode = PyUnicode_FromUnicode(s, size); 5194 if (unicode == NULL) 5195 return NULL; 5196 v = _PyUnicode_AsUTF8String(unicode, errors); 5197 Py_DECREF(unicode); 5198 return v; 5199 } 5200 5201 PyObject * 5202 PyUnicode_AsUTF8String(PyObject *unicode) 5203 { 5204 return _PyUnicode_AsUTF8String(unicode, NULL); 5205 } 5206 5207 /* --- UTF-32 Codec ------------------------------------------------------- */ 5208 5209 PyObject * 5210 PyUnicode_DecodeUTF32(const char *s, 5211 Py_ssize_t size, 5212 const char *errors, 5213 int *byteorder) 5214 { 5215 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 5216 } 5217 5218 PyObject * 5219 PyUnicode_DecodeUTF32Stateful(const char *s, 5220 Py_ssize_t size, 5221 const char *errors, 5222 int *byteorder, 5223 
Py_ssize_t *consumed) 5224 { 5225 const char *starts = s; 5226 Py_ssize_t startinpos; 5227 Py_ssize_t endinpos; 5228 _PyUnicodeWriter writer; 5229 const unsigned char *q, *e; 5230 int le, bo = 0; /* assume native ordering by default */ 5231 const char *encoding; 5232 const char *errmsg = ""; 5233 PyObject *errorHandler = NULL; 5234 PyObject *exc = NULL; 5235 5236 q = (unsigned char *)s; 5237 e = q + size; 5238 5239 if (byteorder) 5240 bo = *byteorder; 5241 5242 /* Check for BOM marks (U+FEFF) in the input and adjust current 5243 byte order setting accordingly. In native mode, the leading BOM 5244 mark is skipped, in all other modes, it is copied to the output 5245 stream as-is (giving a ZWNBSP character). */ 5246 if (bo == 0 && size >= 4) { 5247 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 5248 if (bom == 0x0000FEFF) { 5249 bo = -1; 5250 q += 4; 5251 } 5252 else if (bom == 0xFFFE0000) { 5253 bo = 1; 5254 q += 4; 5255 } 5256 if (byteorder) 5257 *byteorder = bo; 5258 } 5259 5260 if (q == e) { 5261 if (consumed) 5262 *consumed = size; 5263 _Py_RETURN_UNICODE_EMPTY(); 5264 } 5265 5266 #ifdef WORDS_BIGENDIAN 5267 le = bo < 0; 5268 #else 5269 le = bo <= 0; 5270 #endif 5271 encoding = le ? 
"utf-32-le" : "utf-32-be"; 5272 5273 _PyUnicodeWriter_Init(&writer); 5274 writer.min_length = (e - q + 3) / 4; 5275 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 5276 goto onError; 5277 5278 while (1) { 5279 Py_UCS4 ch = 0; 5280 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer); 5281 5282 if (e - q >= 4) { 5283 enum PyUnicode_Kind kind = writer.kind; 5284 void *data = writer.data; 5285 const unsigned char *last = e - 4; 5286 Py_ssize_t pos = writer.pos; 5287 if (le) { 5288 do { 5289 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 5290 if (ch > maxch) 5291 break; 5292 if (kind != PyUnicode_1BYTE_KIND && 5293 Py_UNICODE_IS_SURROGATE(ch)) 5294 break; 5295 PyUnicode_WRITE(kind, data, pos++, ch); 5296 q += 4; 5297 } while (q <= last); 5298 } 5299 else { 5300 do { 5301 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3]; 5302 if (ch > maxch) 5303 break; 5304 if (kind != PyUnicode_1BYTE_KIND && 5305 Py_UNICODE_IS_SURROGATE(ch)) 5306 break; 5307 PyUnicode_WRITE(kind, data, pos++, ch); 5308 q += 4; 5309 } while (q <= last); 5310 } 5311 writer.pos = pos; 5312 } 5313 5314 if (Py_UNICODE_IS_SURROGATE(ch)) { 5315 errmsg = "code point in surrogate code point range(0xd800, 0xe000)"; 5316 startinpos = ((const char *)q) - starts; 5317 endinpos = startinpos + 4; 5318 } 5319 else if (ch <= maxch) { 5320 if (q == e || consumed) 5321 break; 5322 /* remaining bytes at the end? 
(size should be divisible by 4) */ 5323 errmsg = "truncated data"; 5324 startinpos = ((const char *)q) - starts; 5325 endinpos = ((const char *)e) - starts; 5326 } 5327 else { 5328 if (ch < 0x110000) { 5329 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5330 goto onError; 5331 q += 4; 5332 continue; 5333 } 5334 errmsg = "code point not in range(0x110000)"; 5335 startinpos = ((const char *)q) - starts; 5336 endinpos = startinpos + 4; 5337 } 5338 5339 /* The remaining input chars are ignored if the callback 5340 chooses to skip the input */ 5341 if (unicode_decode_call_errorhandler_writer( 5342 errors, &errorHandler, 5343 encoding, errmsg, 5344 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 5345 &writer)) 5346 goto onError; 5347 } 5348 5349 if (consumed) 5350 *consumed = (const char *)q-starts; 5351 5352 Py_XDECREF(errorHandler); 5353 Py_XDECREF(exc); 5354 return _PyUnicodeWriter_Finish(&writer); 5355 5356 onError: 5357 _PyUnicodeWriter_Dealloc(&writer); 5358 Py_XDECREF(errorHandler); 5359 Py_XDECREF(exc); 5360 return NULL; 5361 } 5362 5363 PyObject * 5364 _PyUnicode_EncodeUTF32(PyObject *str, 5365 const char *errors, 5366 int byteorder) 5367 { 5368 enum PyUnicode_Kind kind; 5369 const void *data; 5370 Py_ssize_t len; 5371 PyObject *v; 5372 uint32_t *out; 5373 #if PY_LITTLE_ENDIAN 5374 int native_ordering = byteorder <= 0; 5375 #else 5376 int native_ordering = byteorder >= 0; 5377 #endif 5378 const char *encoding; 5379 Py_ssize_t nsize, pos; 5380 PyObject *errorHandler = NULL; 5381 PyObject *exc = NULL; 5382 PyObject *rep = NULL; 5383 5384 if (!PyUnicode_Check(str)) { 5385 PyErr_BadArgument(); 5386 return NULL; 5387 } 5388 if (PyUnicode_READY(str) == -1) 5389 return NULL; 5390 kind = PyUnicode_KIND(str); 5391 data = PyUnicode_DATA(str); 5392 len = PyUnicode_GET_LENGTH(str); 5393 5394 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0)) 5395 return PyErr_NoMemory(); 5396 nsize = len + (byteorder == 0); 5397 v = 
PyBytes_FromStringAndSize(NULL, nsize * 4); 5398 if (v == NULL) 5399 return NULL; 5400 5401 /* output buffer is 4-bytes aligned */ 5402 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4)); 5403 out = (uint32_t *)PyBytes_AS_STRING(v); 5404 if (byteorder == 0) 5405 *out++ = 0xFEFF; 5406 if (len == 0) 5407 goto done; 5408 5409 if (byteorder == -1) 5410 encoding = "utf-32-le"; 5411 else if (byteorder == 1) 5412 encoding = "utf-32-be"; 5413 else 5414 encoding = "utf-32"; 5415 5416 if (kind == PyUnicode_1BYTE_KIND) { 5417 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering); 5418 goto done; 5419 } 5420 5421 pos = 0; 5422 while (pos < len) { 5423 Py_ssize_t repsize, moreunits; 5424 5425 if (kind == PyUnicode_2BYTE_KIND) { 5426 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos, 5427 &out, native_ordering); 5428 } 5429 else { 5430 assert(kind == PyUnicode_4BYTE_KIND); 5431 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos, 5432 &out, native_ordering); 5433 } 5434 if (pos == len) 5435 break; 5436 5437 rep = unicode_encode_call_errorhandler( 5438 errors, &errorHandler, 5439 encoding, "surrogates not allowed", 5440 str, &exc, pos, pos + 1, &pos); 5441 if (!rep) 5442 goto error; 5443 5444 if (PyBytes_Check(rep)) { 5445 repsize = PyBytes_GET_SIZE(rep); 5446 if (repsize & 3) { 5447 raise_encode_exception(&exc, encoding, 5448 str, pos - 1, pos, 5449 "surrogates not allowed"); 5450 goto error; 5451 } 5452 moreunits = repsize / 4; 5453 } 5454 else { 5455 assert(PyUnicode_Check(rep)); 5456 if (PyUnicode_READY(rep) < 0) 5457 goto error; 5458 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5459 if (!PyUnicode_IS_ASCII(rep)) { 5460 raise_encode_exception(&exc, encoding, 5461 str, pos - 1, pos, 5462 "surrogates not allowed"); 5463 goto error; 5464 } 5465 } 5466 5467 /* four bytes are reserved for each surrogate */ 5468 if (moreunits > 1) { 5469 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v); 5470 Py_ssize_t morebytes = 4 * 
(moreunits - 1); 5471 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { 5472 /* integer overflow */ 5473 PyErr_NoMemory(); 5474 goto error; 5475 } 5476 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) 5477 goto error; 5478 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos; 5479 } 5480 5481 if (PyBytes_Check(rep)) { 5482 memcpy(out, PyBytes_AS_STRING(rep), repsize); 5483 out += moreunits; 5484 } else /* rep is unicode */ { 5485 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5486 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize, 5487 &out, native_ordering); 5488 } 5489 5490 Py_CLEAR(rep); 5491 } 5492 5493 /* Cut back to size actually needed. This is necessary for, for example, 5494 encoding of a string containing isolated surrogates and the 'ignore' 5495 handler is used. */ 5496 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); 5497 if (nsize != PyBytes_GET_SIZE(v)) 5498 _PyBytes_Resize(&v, nsize); 5499 Py_XDECREF(errorHandler); 5500 Py_XDECREF(exc); 5501 done: 5502 return v; 5503 error: 5504 Py_XDECREF(rep); 5505 Py_XDECREF(errorHandler); 5506 Py_XDECREF(exc); 5507 Py_XDECREF(v); 5508 return NULL; 5509 } 5510 5511 PyObject * 5512 PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5513 Py_ssize_t size, 5514 const char *errors, 5515 int byteorder) 5516 { 5517 PyObject *result; 5518 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5519 if (tmp == NULL) 5520 return NULL; 5521 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); 5522 Py_DECREF(tmp); 5523 return result; 5524 } 5525 5526 PyObject * 5527 PyUnicode_AsUTF32String(PyObject *unicode) 5528 { 5529 return _PyUnicode_EncodeUTF32(unicode, NULL, 0); 5530 } 5531 5532 /* --- UTF-16 Codec ------------------------------------------------------- */ 5533 5534 PyObject * 5535 PyUnicode_DecodeUTF16(const char *s, 5536 Py_ssize_t size, 5537 const char *errors, 5538 int *byteorder) 5539 { 5540 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5541 } 5542 5543 
PyObject * 5544 PyUnicode_DecodeUTF16Stateful(const char *s, 5545 Py_ssize_t size, 5546 const char *errors, 5547 int *byteorder, 5548 Py_ssize_t *consumed) 5549 { 5550 const char *starts = s; 5551 Py_ssize_t startinpos; 5552 Py_ssize_t endinpos; 5553 _PyUnicodeWriter writer; 5554 const unsigned char *q, *e; 5555 int bo = 0; /* assume native ordering by default */ 5556 int native_ordering; 5557 const char *errmsg = ""; 5558 PyObject *errorHandler = NULL; 5559 PyObject *exc = NULL; 5560 const char *encoding; 5561 5562 q = (unsigned char *)s; 5563 e = q + size; 5564 5565 if (byteorder) 5566 bo = *byteorder; 5567 5568 /* Check for BOM marks (U+FEFF) in the input and adjust current 5569 byte order setting accordingly. In native mode, the leading BOM 5570 mark is skipped, in all other modes, it is copied to the output 5571 stream as-is (giving a ZWNBSP character). */ 5572 if (bo == 0 && size >= 2) { 5573 const Py_UCS4 bom = (q[1] << 8) | q[0]; 5574 if (bom == 0xFEFF) { 5575 q += 2; 5576 bo = -1; 5577 } 5578 else if (bom == 0xFFFE) { 5579 q += 2; 5580 bo = 1; 5581 } 5582 if (byteorder) 5583 *byteorder = bo; 5584 } 5585 5586 if (q == e) { 5587 if (consumed) 5588 *consumed = size; 5589 _Py_RETURN_UNICODE_EMPTY(); 5590 } 5591 5592 #if PY_LITTLE_ENDIAN 5593 native_ordering = bo <= 0; 5594 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be"; 5595 #else 5596 native_ordering = bo >= 0; 5597 encoding = bo >= 0 ? 
"utf-16-be" : "utf-16-le"; 5598 #endif 5599 5600 /* Note: size will always be longer than the resulting Unicode 5601 character count */ 5602 _PyUnicodeWriter_Init(&writer); 5603 writer.min_length = (e - q + 1) / 2; 5604 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 5605 goto onError; 5606 5607 while (1) { 5608 Py_UCS4 ch = 0; 5609 if (e - q >= 2) { 5610 int kind = writer.kind; 5611 if (kind == PyUnicode_1BYTE_KIND) { 5612 if (PyUnicode_IS_ASCII(writer.buffer)) 5613 ch = asciilib_utf16_decode(&q, e, 5614 (Py_UCS1*)writer.data, &writer.pos, 5615 native_ordering); 5616 else 5617 ch = ucs1lib_utf16_decode(&q, e, 5618 (Py_UCS1*)writer.data, &writer.pos, 5619 native_ordering); 5620 } else if (kind == PyUnicode_2BYTE_KIND) { 5621 ch = ucs2lib_utf16_decode(&q, e, 5622 (Py_UCS2*)writer.data, &writer.pos, 5623 native_ordering); 5624 } else { 5625 assert(kind == PyUnicode_4BYTE_KIND); 5626 ch = ucs4lib_utf16_decode(&q, e, 5627 (Py_UCS4*)writer.data, &writer.pos, 5628 native_ordering); 5629 } 5630 } 5631 5632 switch (ch) 5633 { 5634 case 0: 5635 /* remaining byte at the end? 
(size should be even) */ 5636 if (q == e || consumed) 5637 goto End; 5638 errmsg = "truncated data"; 5639 startinpos = ((const char *)q) - starts; 5640 endinpos = ((const char *)e) - starts; 5641 break; 5642 /* The remaining input chars are ignored if the callback 5643 chooses to skip the input */ 5644 case 1: 5645 q -= 2; 5646 if (consumed) 5647 goto End; 5648 errmsg = "unexpected end of data"; 5649 startinpos = ((const char *)q) - starts; 5650 endinpos = ((const char *)e) - starts; 5651 break; 5652 case 2: 5653 errmsg = "illegal encoding"; 5654 startinpos = ((const char *)q) - 2 - starts; 5655 endinpos = startinpos + 2; 5656 break; 5657 case 3: 5658 errmsg = "illegal UTF-16 surrogate"; 5659 startinpos = ((const char *)q) - 4 - starts; 5660 endinpos = startinpos + 2; 5661 break; 5662 default: 5663 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5664 goto onError; 5665 continue; 5666 } 5667 5668 if (unicode_decode_call_errorhandler_writer( 5669 errors, 5670 &errorHandler, 5671 encoding, errmsg, 5672 &starts, 5673 (const char **)&e, 5674 &startinpos, 5675 &endinpos, 5676 &exc, 5677 (const char **)&q, 5678 &writer)) 5679 goto onError; 5680 } 5681 5682 End: 5683 if (consumed) 5684 *consumed = (const char *)q-starts; 5685 5686 Py_XDECREF(errorHandler); 5687 Py_XDECREF(exc); 5688 return _PyUnicodeWriter_Finish(&writer); 5689 5690 onError: 5691 _PyUnicodeWriter_Dealloc(&writer); 5692 Py_XDECREF(errorHandler); 5693 Py_XDECREF(exc); 5694 return NULL; 5695 } 5696 5697 PyObject * 5698 _PyUnicode_EncodeUTF16(PyObject *str, 5699 const char *errors, 5700 int byteorder) 5701 { 5702 enum PyUnicode_Kind kind; 5703 const void *data; 5704 Py_ssize_t len; 5705 PyObject *v; 5706 unsigned short *out; 5707 Py_ssize_t pairs; 5708 #if PY_BIG_ENDIAN 5709 int native_ordering = byteorder >= 0; 5710 #else 5711 int native_ordering = byteorder <= 0; 5712 #endif 5713 const char *encoding; 5714 Py_ssize_t nsize, pos; 5715 PyObject *errorHandler = NULL; 5716 PyObject *exc = NULL; 5717 
PyObject *rep = NULL; 5718 5719 if (!PyUnicode_Check(str)) { 5720 PyErr_BadArgument(); 5721 return NULL; 5722 } 5723 if (PyUnicode_READY(str) == -1) 5724 return NULL; 5725 kind = PyUnicode_KIND(str); 5726 data = PyUnicode_DATA(str); 5727 len = PyUnicode_GET_LENGTH(str); 5728 5729 pairs = 0; 5730 if (kind == PyUnicode_4BYTE_KIND) { 5731 const Py_UCS4 *in = (const Py_UCS4 *)data; 5732 const Py_UCS4 *end = in + len; 5733 while (in < end) { 5734 if (*in++ >= 0x10000) { 5735 pairs++; 5736 } 5737 } 5738 } 5739 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) { 5740 return PyErr_NoMemory(); 5741 } 5742 nsize = len + pairs + (byteorder == 0); 5743 v = PyBytes_FromStringAndSize(NULL, nsize * 2); 5744 if (v == NULL) { 5745 return NULL; 5746 } 5747 5748 /* output buffer is 2-bytes aligned */ 5749 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); 5750 out = (unsigned short *)PyBytes_AS_STRING(v); 5751 if (byteorder == 0) { 5752 *out++ = 0xFEFF; 5753 } 5754 if (len == 0) { 5755 goto done; 5756 } 5757 5758 if (kind == PyUnicode_1BYTE_KIND) { 5759 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering); 5760 goto done; 5761 } 5762 5763 if (byteorder < 0) { 5764 encoding = "utf-16-le"; 5765 } 5766 else if (byteorder > 0) { 5767 encoding = "utf-16-be"; 5768 } 5769 else { 5770 encoding = "utf-16"; 5771 } 5772 5773 pos = 0; 5774 while (pos < len) { 5775 Py_ssize_t repsize, moreunits; 5776 5777 if (kind == PyUnicode_2BYTE_KIND) { 5778 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos, 5779 &out, native_ordering); 5780 } 5781 else { 5782 assert(kind == PyUnicode_4BYTE_KIND); 5783 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos, 5784 &out, native_ordering); 5785 } 5786 if (pos == len) 5787 break; 5788 5789 rep = unicode_encode_call_errorhandler( 5790 errors, &errorHandler, 5791 encoding, "surrogates not allowed", 5792 str, &exc, pos, pos + 1, &pos); 5793 if (!rep) 5794 goto error; 5795 5796 if (PyBytes_Check(rep)) { 5797 repsize 
= PyBytes_GET_SIZE(rep); 5798 if (repsize & 1) { 5799 raise_encode_exception(&exc, encoding, 5800 str, pos - 1, pos, 5801 "surrogates not allowed"); 5802 goto error; 5803 } 5804 moreunits = repsize / 2; 5805 } 5806 else { 5807 assert(PyUnicode_Check(rep)); 5808 if (PyUnicode_READY(rep) < 0) 5809 goto error; 5810 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5811 if (!PyUnicode_IS_ASCII(rep)) { 5812 raise_encode_exception(&exc, encoding, 5813 str, pos - 1, pos, 5814 "surrogates not allowed"); 5815 goto error; 5816 } 5817 } 5818 5819 /* two bytes are reserved for each surrogate */ 5820 if (moreunits > 1) { 5821 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v); 5822 Py_ssize_t morebytes = 2 * (moreunits - 1); 5823 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { 5824 /* integer overflow */ 5825 PyErr_NoMemory(); 5826 goto error; 5827 } 5828 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) 5829 goto error; 5830 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos; 5831 } 5832 5833 if (PyBytes_Check(rep)) { 5834 memcpy(out, PyBytes_AS_STRING(rep), repsize); 5835 out += moreunits; 5836 } else /* rep is unicode */ { 5837 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5838 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize, 5839 &out, native_ordering); 5840 } 5841 5842 Py_CLEAR(rep); 5843 } 5844 5845 /* Cut back to size actually needed. This is necessary for, for example, 5846 encoding of a string containing isolated surrogates and the 'ignore' handler 5847 is used. 
*/ 5848 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); 5849 if (nsize != PyBytes_GET_SIZE(v)) 5850 _PyBytes_Resize(&v, nsize); 5851 Py_XDECREF(errorHandler); 5852 Py_XDECREF(exc); 5853 done: 5854 return v; 5855 error: 5856 Py_XDECREF(rep); 5857 Py_XDECREF(errorHandler); 5858 Py_XDECREF(exc); 5859 Py_XDECREF(v); 5860 return NULL; 5861 #undef STORECHAR 5862 } 5863 5864 PyObject * 5865 PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5866 Py_ssize_t size, 5867 const char *errors, 5868 int byteorder) 5869 { 5870 PyObject *result; 5871 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5872 if (tmp == NULL) 5873 return NULL; 5874 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); 5875 Py_DECREF(tmp); 5876 return result; 5877 } 5878 5879 PyObject * 5880 PyUnicode_AsUTF16String(PyObject *unicode) 5881 { 5882 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 5883 } 5884 5885 /* --- Unicode Escape Codec ----------------------------------------------- */ 5886 5887 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5888 5889 PyObject * 5890 _PyUnicode_DecodeUnicodeEscape(const char *s, 5891 Py_ssize_t size, 5892 const char *errors, 5893 const char **first_invalid_escape) 5894 { 5895 const char *starts = s; 5896 _PyUnicodeWriter writer; 5897 const char *end; 5898 PyObject *errorHandler = NULL; 5899 PyObject *exc = NULL; 5900 5901 // so we can remember if we've seen an invalid escape char or not 5902 *first_invalid_escape = NULL; 5903 5904 if (size == 0) { 5905 _Py_RETURN_UNICODE_EMPTY(); 5906 } 5907 /* Escaped strings will always be longer than the resulting 5908 Unicode string, so we start with size here and then reduce the 5909 length after conversion to the true value. 
5910 (but if the error callback returns a long replacement string 5911 we'll have to allocate more space) */ 5912 _PyUnicodeWriter_Init(&writer); 5913 writer.min_length = size; 5914 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) { 5915 goto onError; 5916 } 5917 5918 end = s + size; 5919 while (s < end) { 5920 unsigned char c = (unsigned char) *s++; 5921 Py_UCS4 ch; 5922 int count; 5923 Py_ssize_t startinpos; 5924 Py_ssize_t endinpos; 5925 const char *message; 5926 5927 #define WRITE_ASCII_CHAR(ch) \ 5928 do { \ 5929 assert(ch <= 127); \ 5930 assert(writer.pos < writer.size); \ 5931 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \ 5932 } while(0) 5933 5934 #define WRITE_CHAR(ch) \ 5935 do { \ 5936 if (ch <= writer.maxchar) { \ 5937 assert(writer.pos < writer.size); \ 5938 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \ 5939 } \ 5940 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \ 5941 goto onError; \ 5942 } \ 5943 } while(0) 5944 5945 /* Non-escape characters are interpreted as Unicode ordinals */ 5946 if (c != '\\') { 5947 WRITE_CHAR(c); 5948 continue; 5949 } 5950 5951 startinpos = s - starts - 1; 5952 /* \ - Escapes */ 5953 if (s >= end) { 5954 message = "\\ at end of string"; 5955 goto error; 5956 } 5957 c = (unsigned char) *s++; 5958 5959 assert(writer.pos < writer.size); 5960 switch (c) { 5961 5962 /* \x escapes */ 5963 case '\n': continue; 5964 case '\\': WRITE_ASCII_CHAR('\\'); continue; 5965 case '\'': WRITE_ASCII_CHAR('\''); continue; 5966 case '\"': WRITE_ASCII_CHAR('\"'); continue; 5967 case 'b': WRITE_ASCII_CHAR('\b'); continue; 5968 /* FF */ 5969 case 'f': WRITE_ASCII_CHAR('\014'); continue; 5970 case 't': WRITE_ASCII_CHAR('\t'); continue; 5971 case 'n': WRITE_ASCII_CHAR('\n'); continue; 5972 case 'r': WRITE_ASCII_CHAR('\r'); continue; 5973 /* VT */ 5974 case 'v': WRITE_ASCII_CHAR('\013'); continue; 5975 /* BEL, not classic C */ 5976 case 'a': WRITE_ASCII_CHAR('\007'); continue; 5977 5978 /* \OOO 
(octal) escapes */ 5979 case '0': case '1': case '2': case '3': 5980 case '4': case '5': case '6': case '7': 5981 ch = c - '0'; 5982 if (s < end && '0' <= *s && *s <= '7') { 5983 ch = (ch<<3) + *s++ - '0'; 5984 if (s < end && '0' <= *s && *s <= '7') { 5985 ch = (ch<<3) + *s++ - '0'; 5986 } 5987 } 5988 WRITE_CHAR(ch); 5989 continue; 5990 5991 /* hex escapes */ 5992 /* \xXX */ 5993 case 'x': 5994 count = 2; 5995 message = "truncated \\xXX escape"; 5996 goto hexescape; 5997 5998 /* \uXXXX */ 5999 case 'u': 6000 count = 4; 6001 message = "truncated \\uXXXX escape"; 6002 goto hexescape; 6003 6004 /* \UXXXXXXXX */ 6005 case 'U': 6006 count = 8; 6007 message = "truncated \\UXXXXXXXX escape"; 6008 hexescape: 6009 for (ch = 0; count && s < end; ++s, --count) { 6010 c = (unsigned char)*s; 6011 ch <<= 4; 6012 if (c >= '0' && c <= '9') { 6013 ch += c - '0'; 6014 } 6015 else if (c >= 'a' && c <= 'f') { 6016 ch += c - ('a' - 10); 6017 } 6018 else if (c >= 'A' && c <= 'F') { 6019 ch += c - ('A' - 10); 6020 } 6021 else { 6022 break; 6023 } 6024 } 6025 if (count) { 6026 goto error; 6027 } 6028 6029 /* when we get here, ch is a 32-bit unicode character */ 6030 if (ch > MAX_UNICODE) { 6031 message = "illegal Unicode character"; 6032 goto error; 6033 } 6034 6035 WRITE_CHAR(ch); 6036 continue; 6037 6038 /* \N{name} */ 6039 case 'N': 6040 if (ucnhash_CAPI == NULL) { 6041 /* load the unicode data module */ 6042 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 6043 PyUnicodeData_CAPSULE_NAME, 1); 6044 if (ucnhash_CAPI == NULL) { 6045 PyErr_SetString( 6046 PyExc_UnicodeError, 6047 "\\N escapes not supported (can't load unicodedata module)" 6048 ); 6049 goto onError; 6050 } 6051 } 6052 6053 message = "malformed \\N character escape"; 6054 if (*s == '{') { 6055 const char *start = ++s; 6056 size_t namelen; 6057 /* look for the closing brace */ 6058 while (s < end && *s != '}') 6059 s++; 6060 namelen = s - start; 6061 if (namelen && s < end) { 6062 /* found a name. 
look it up in the unicode database */ 6063 s++; 6064 ch = 0xffffffff; /* in case 'getcode' messes up */ 6065 if (namelen <= INT_MAX && 6066 ucnhash_CAPI->getcode(NULL, start, (int)namelen, 6067 &ch, 0)) { 6068 assert(ch <= MAX_UNICODE); 6069 WRITE_CHAR(ch); 6070 continue; 6071 } 6072 message = "unknown Unicode character name"; 6073 } 6074 } 6075 goto error; 6076 6077 default: 6078 if (*first_invalid_escape == NULL) { 6079 *first_invalid_escape = s-1; /* Back up one char, since we've 6080 already incremented s. */ 6081 } 6082 WRITE_ASCII_CHAR('\\'); 6083 WRITE_CHAR(c); 6084 continue; 6085 } 6086 6087 error: 6088 endinpos = s-starts; 6089 writer.min_length = end - s + writer.pos; 6090 if (unicode_decode_call_errorhandler_writer( 6091 errors, &errorHandler, 6092 "unicodeescape", message, 6093 &starts, &end, &startinpos, &endinpos, &exc, &s, 6094 &writer)) { 6095 goto onError; 6096 } 6097 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) { 6098 goto onError; 6099 } 6100 6101 #undef WRITE_ASCII_CHAR 6102 #undef WRITE_CHAR 6103 } 6104 6105 Py_XDECREF(errorHandler); 6106 Py_XDECREF(exc); 6107 return _PyUnicodeWriter_Finish(&writer); 6108 6109 onError: 6110 _PyUnicodeWriter_Dealloc(&writer); 6111 Py_XDECREF(errorHandler); 6112 Py_XDECREF(exc); 6113 return NULL; 6114 } 6115 6116 PyObject * 6117 PyUnicode_DecodeUnicodeEscape(const char *s, 6118 Py_ssize_t size, 6119 const char *errors) 6120 { 6121 const char *first_invalid_escape; 6122 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors, 6123 &first_invalid_escape); 6124 if (result == NULL) 6125 return NULL; 6126 if (first_invalid_escape != NULL) { 6127 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, 6128 "invalid escape sequence '\\%c'", 6129 *first_invalid_escape) < 0) { 6130 Py_DECREF(result); 6131 return NULL; 6132 } 6133 } 6134 return result; 6135 } 6136 6137 /* Return a Unicode-Escape string version of the Unicode object. 
*/ 6138 6139 PyObject * 6140 PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 6141 { 6142 Py_ssize_t i, len; 6143 PyObject *repr; 6144 char *p; 6145 enum PyUnicode_Kind kind; 6146 void *data; 6147 Py_ssize_t expandsize; 6148 6149 /* Initial allocation is based on the longest-possible character 6150 escape. 6151 6152 For UCS1 strings it's '\xxx', 4 bytes per source character. 6153 For UCS2 strings it's '\uxxxx', 6 bytes per source character. 6154 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. 6155 */ 6156 6157 if (!PyUnicode_Check(unicode)) { 6158 PyErr_BadArgument(); 6159 return NULL; 6160 } 6161 if (PyUnicode_READY(unicode) == -1) { 6162 return NULL; 6163 } 6164 6165 len = PyUnicode_GET_LENGTH(unicode); 6166 if (len == 0) { 6167 return PyBytes_FromStringAndSize(NULL, 0); 6168 } 6169 6170 kind = PyUnicode_KIND(unicode); 6171 data = PyUnicode_DATA(unicode); 6172 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 6173 bytes, and 1 byte characters 4. 
*/ 6174 expandsize = kind * 2 + 2; 6175 if (len > PY_SSIZE_T_MAX / expandsize) { 6176 return PyErr_NoMemory(); 6177 } 6178 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6179 if (repr == NULL) { 6180 return NULL; 6181 } 6182 6183 p = PyBytes_AS_STRING(repr); 6184 for (i = 0; i < len; i++) { 6185 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 6186 6187 /* U+0000-U+00ff range */ 6188 if (ch < 0x100) { 6189 if (ch >= ' ' && ch < 127) { 6190 if (ch != '\\') { 6191 /* Copy printable US ASCII as-is */ 6192 *p++ = (char) ch; 6193 } 6194 /* Escape backslashes */ 6195 else { 6196 *p++ = '\\'; 6197 *p++ = '\\'; 6198 } 6199 } 6200 6201 /* Map special whitespace to '\t', \n', '\r' */ 6202 else if (ch == '\t') { 6203 *p++ = '\\'; 6204 *p++ = 't'; 6205 } 6206 else if (ch == '\n') { 6207 *p++ = '\\'; 6208 *p++ = 'n'; 6209 } 6210 else if (ch == '\r') { 6211 *p++ = '\\'; 6212 *p++ = 'r'; 6213 } 6214 6215 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */ 6216 else { 6217 *p++ = '\\'; 6218 *p++ = 'x'; 6219 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6220 *p++ = Py_hexdigits[ch & 0x000F]; 6221 } 6222 } 6223 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */ 6224 else if (ch < 0x10000) { 6225 *p++ = '\\'; 6226 *p++ = 'u'; 6227 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 6228 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 6229 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6230 *p++ = Py_hexdigits[ch & 0x000F]; 6231 } 6232 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */ 6233 else { 6234 6235 /* Make sure that the first two digits are zero */ 6236 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff); 6237 *p++ = '\\'; 6238 *p++ = 'U'; 6239 *p++ = '0'; 6240 *p++ = '0'; 6241 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; 6242 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; 6243 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 6244 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 6245 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 6246 *p++ = Py_hexdigits[ch & 
0x0000000F]; 6247 } 6248 } 6249 6250 assert(p - PyBytes_AS_STRING(repr) > 0); 6251 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) { 6252 return NULL; 6253 } 6254 return repr; 6255 } 6256 6257 PyObject * 6258 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 6259 Py_ssize_t size) 6260 { 6261 PyObject *result; 6262 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6263 if (tmp == NULL) { 6264 return NULL; 6265 } 6266 6267 result = PyUnicode_AsUnicodeEscapeString(tmp); 6268 Py_DECREF(tmp); 6269 return result; 6270 } 6271 6272 /* --- Raw Unicode Escape Codec ------------------------------------------- */ 6273 6274 PyObject * 6275 PyUnicode_DecodeRawUnicodeEscape(const char *s, 6276 Py_ssize_t size, 6277 const char *errors) 6278 { 6279 const char *starts = s; 6280 _PyUnicodeWriter writer; 6281 const char *end; 6282 PyObject *errorHandler = NULL; 6283 PyObject *exc = NULL; 6284 6285 if (size == 0) { 6286 _Py_RETURN_UNICODE_EMPTY(); 6287 } 6288 6289 /* Escaped strings will always be longer than the resulting 6290 Unicode string, so we start with size here and then reduce the 6291 length after conversion to the true value. 
(But decoding error 6292 handler might have to resize the string) */ 6293 _PyUnicodeWriter_Init(&writer); 6294 writer.min_length = size; 6295 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) { 6296 goto onError; 6297 } 6298 6299 end = s + size; 6300 while (s < end) { 6301 unsigned char c = (unsigned char) *s++; 6302 Py_UCS4 ch; 6303 int count; 6304 Py_ssize_t startinpos; 6305 Py_ssize_t endinpos; 6306 const char *message; 6307 6308 #define WRITE_CHAR(ch) \ 6309 do { \ 6310 if (ch <= writer.maxchar) { \ 6311 assert(writer.pos < writer.size); \ 6312 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \ 6313 } \ 6314 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \ 6315 goto onError; \ 6316 } \ 6317 } while(0) 6318 6319 /* Non-escape characters are interpreted as Unicode ordinals */ 6320 if (c != '\\' || s >= end) { 6321 WRITE_CHAR(c); 6322 continue; 6323 } 6324 6325 c = (unsigned char) *s++; 6326 if (c == 'u') { 6327 count = 4; 6328 message = "truncated \\uXXXX escape"; 6329 } 6330 else if (c == 'U') { 6331 count = 8; 6332 message = "truncated \\UXXXXXXXX escape"; 6333 } 6334 else { 6335 assert(writer.pos < writer.size); 6336 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\'); 6337 WRITE_CHAR(c); 6338 continue; 6339 } 6340 startinpos = s - starts - 2; 6341 6342 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */ 6343 for (ch = 0; count && s < end; ++s, --count) { 6344 c = (unsigned char)*s; 6345 ch <<= 4; 6346 if (c >= '0' && c <= '9') { 6347 ch += c - '0'; 6348 } 6349 else if (c >= 'a' && c <= 'f') { 6350 ch += c - ('a' - 10); 6351 } 6352 else if (c >= 'A' && c <= 'F') { 6353 ch += c - ('A' - 10); 6354 } 6355 else { 6356 break; 6357 } 6358 } 6359 if (!count) { 6360 if (ch <= MAX_UNICODE) { 6361 WRITE_CHAR(ch); 6362 continue; 6363 } 6364 message = "\\Uxxxxxxxx out of range"; 6365 } 6366 6367 endinpos = s-starts; 6368 writer.min_length = end - s + writer.pos; 6369 if (unicode_decode_call_errorhandler_writer( 6370 errors, 
/* Return a Raw-Unicode-Escape version of the Unicode object.

   Characters below U+0100 are copied as single bytes; all other characters
   are written as \uHHHH or \U00HHHHHH.  Unlike the unicode-escape encoder,
   backslashes and control characters are not escaped.

   Returns a new bytes object, or NULL with an exception set. */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    PyObject *repr;
    char *p;
    Py_ssize_t expandsize, pos;
    int kind;
    void *data;
    Py_ssize_t len;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(unicode) == -1) {
        return NULL;
    }
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);
    len = PyUnicode_GET_LENGTH(unicode);
    /* Fast path: a 1-byte string contains nothing that needs escaping, so
       its buffer is copied verbatim. */
    if (kind == PyUnicode_1BYTE_KIND) {
        return PyBytes_FromStringAndSize(data, len);
    }

    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
       bytes, and 1 byte characters 4. */
    expandsize = kind * 2 + 2;

    /* Guard the multiplication below against overflow. */
    if (len > PY_SSIZE_T_MAX / expandsize) {
        return PyErr_NoMemory();
    }
    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
    if (repr == NULL) {
        return NULL;
    }
    if (len == 0) {
        return repr;
    }

    p = PyBytes_AS_STRING(repr);
    for (pos = 0; pos < len; pos++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* U+0000-U+00ff range: Copy 8-bit characters as-is */
        if (ch < 0x100) {
            *p++ = (char) ch;
        }
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
        else if (ch < 0x10000) {
            *p++ = '\\';
            *p++ = 'u';
            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
            *p++ = Py_hexdigits[ch & 15];
        }
        /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
        else {
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *p++ = '\\';
            *p++ = 'U';
            *p++ = '0';
            *p++ = '0';
            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
            *p++ = Py_hexdigits[ch & 15];
        }
    }

    /* Trim the worst-case allocation down to the bytes actually written. */
    assert(p > PyBytes_AS_STRING(repr));
    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
        return NULL;
    }
    return repr;
}

/* Py_UNICODE-buffer variant: builds a temporary str object from (s, size)
   and encodes it with PyUnicode_AsRawUnicodeEscapeString(). */
PyObject *
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
                                 Py_ssize_t size)
{
    PyObject *result;
    PyObject *tmp = PyUnicode_FromUnicode(s, size);
    if (tmp == NULL)
        return NULL;
    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
    Py_DECREF(tmp);
    return result;
}

/* --- Unicode Internal Codec ------------------------------------------- */

/* Decode a buffer holding raw Py_UNICODE code units (the deprecated
   "unicode_internal" codec).

   A DeprecationWarning is issued first; if issuing it fails, NULL is
   returned before any decoding happens.  The input is consumed in
   Py_UNICODE_SIZE-byte units.  A trailing partial unit is reported to the
   error handler as "truncated input"; on wide builds a unit above 0x10FFFF
   is reported as an illegal code point; on narrow builds a high surrogate
   directly followed by a low surrogate is joined into one code point.

   Returns a new str object, or NULL with an exception set. */
PyObject *
_PyUnicode_DecodeUnicodeInternal(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const char *end;
    const char *reason;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (PyErr_WarnEx(PyExc_DeprecationWarning,
                     "unicode_internal codec has been deprecated",
                     1))
        return NULL;

    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    _PyUnicodeWriter_Init(&writer);
    if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
        PyErr_NoMemory();
        goto onError;
    }
    /* Number of code units in the input, rounded up. */
    writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;

    end = s + size;
    while (s < end) {
        Py_UNICODE uch;
        Py_UCS4 ch;
        if (end - s < Py_UNICODE_SIZE) {
            endinpos = end-starts;
            reason = "truncated input";
            goto error;
        }
        /* We copy the raw representation one byte at a time because the
           pointer may be unaligned (see test_codeccallbacks). */
        ((char *) &uch)[0] = s[0];
        ((char *) &uch)[1] = s[1];
#ifdef Py_UNICODE_WIDE
        ((char *) &uch)[2] = s[2];
        ((char *) &uch)[3] = s[3];
#endif
        ch = uch;
#ifdef Py_UNICODE_WIDE
        /* We have to sanity check the raw data, otherwise doom looms for
           some malformed UCS-4 data. */
        if (ch > 0x10ffff) {
            endinpos = s - starts + Py_UNICODE_SIZE;
            reason = "illegal code point (> 0x10FFFF)";
            goto error;
        }
#endif
        s += Py_UNICODE_SIZE;
#ifndef Py_UNICODE_WIDE
        /* Narrow build: combine a surrogate pair into a single code point
           when the next unit is a low surrogate. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
        {
            Py_UNICODE uch2;
            ((char *) &uch2)[0] = s[0];
            ((char *) &uch2)[1] = s[1];
            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
            {
                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
                s += Py_UNICODE_SIZE;
            }
        }
#endif

        if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
            goto onError;
        continue;

        /* Only reached through the gotos above; s has not been advanced
           past the offending unit, so it marks the start of the error. */
  error:
        startinpos = s - starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicode_internal", reason,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
*/ 6593 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6594 } 6595 6596 /* create or adjust a UnicodeEncodeError */ 6597 static void 6598 make_encode_exception(PyObject **exceptionObject, 6599 const char *encoding, 6600 PyObject *unicode, 6601 Py_ssize_t startpos, Py_ssize_t endpos, 6602 const char *reason) 6603 { 6604 if (*exceptionObject == NULL) { 6605 *exceptionObject = PyObject_CallFunction( 6606 PyExc_UnicodeEncodeError, "sOnns", 6607 encoding, unicode, startpos, endpos, reason); 6608 } 6609 else { 6610 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6611 goto onError; 6612 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6613 goto onError; 6614 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6615 goto onError; 6616 return; 6617 onError: 6618 Py_CLEAR(*exceptionObject); 6619 } 6620 } 6621 6622 /* raises a UnicodeEncodeError */ 6623 static void 6624 raise_encode_exception(PyObject **exceptionObject, 6625 const char *encoding, 6626 PyObject *unicode, 6627 Py_ssize_t startpos, Py_ssize_t endpos, 6628 const char *reason) 6629 { 6630 make_encode_exception(exceptionObject, 6631 encoding, unicode, startpos, endpos, reason); 6632 if (*exceptionObject != NULL) 6633 PyCodec_StrictErrors(*exceptionObject); 6634 } 6635 6636 /* error handling callback helper: 6637 build arguments, call the callback and check the arguments, 6638 put the result into newpos and return the replacement string, which 6639 has to be freed by the caller */ 6640 static PyObject * 6641 unicode_encode_call_errorhandler(const char *errors, 6642 PyObject **errorHandler, 6643 const char *encoding, const char *reason, 6644 PyObject *unicode, PyObject **exceptionObject, 6645 Py_ssize_t startpos, Py_ssize_t endpos, 6646 Py_ssize_t *newpos) 6647 { 6648 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6649 Py_ssize_t len; 6650 PyObject *restuple; 6651 PyObject *resunicode; 6652 6653 if (*errorHandler == NULL) { 6654 
*errorHandler = PyCodec_LookupError(errors); 6655 if (*errorHandler == NULL) 6656 return NULL; 6657 } 6658 6659 if (PyUnicode_READY(unicode) == -1) 6660 return NULL; 6661 len = PyUnicode_GET_LENGTH(unicode); 6662 6663 make_encode_exception(exceptionObject, 6664 encoding, unicode, startpos, endpos, reason); 6665 if (*exceptionObject == NULL) 6666 return NULL; 6667 6668 restuple = PyObject_CallFunctionObjArgs( 6669 *errorHandler, *exceptionObject, NULL); 6670 if (restuple == NULL) 6671 return NULL; 6672 if (!PyTuple_Check(restuple)) { 6673 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6674 Py_DECREF(restuple); 6675 return NULL; 6676 } 6677 if (!PyArg_ParseTuple(restuple, argparse, 6678 &resunicode, newpos)) { 6679 Py_DECREF(restuple); 6680 return NULL; 6681 } 6682 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6683 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6684 Py_DECREF(restuple); 6685 return NULL; 6686 } 6687 if (*newpos<0) 6688 *newpos = len + *newpos; 6689 if (*newpos<0 || *newpos>len) { 6690 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6691 Py_DECREF(restuple); 6692 return NULL; 6693 } 6694 Py_INCREF(resunicode); 6695 Py_DECREF(restuple); 6696 return resunicode; 6697 } 6698 6699 static PyObject * 6700 unicode_encode_ucs1(PyObject *unicode, 6701 const char *errors, 6702 const Py_UCS4 limit) 6703 { 6704 /* input state */ 6705 Py_ssize_t pos=0, size; 6706 int kind; 6707 void *data; 6708 /* pointer into the output */ 6709 char *str; 6710 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6711 const char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; 6712 PyObject *error_handler_obj = NULL; 6713 PyObject *exc = NULL; 6714 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 6715 PyObject *rep = NULL; 6716 /* output object */ 6717 _PyBytesWriter writer; 6718 6719 if (PyUnicode_READY(unicode) == -1) 6720 return NULL; 6721 size = PyUnicode_GET_LENGTH(unicode); 6722 kind = PyUnicode_KIND(unicode); 6723 data = PyUnicode_DATA(unicode); 6724 /* allocate enough for a simple encoding without 6725 replacements, if we need more, we'll resize */ 6726 if (size == 0) 6727 return PyBytes_FromStringAndSize(NULL, 0); 6728 6729 _PyBytesWriter_Init(&writer); 6730 str = _PyBytesWriter_Alloc(&writer, size); 6731 if (str == NULL) 6732 return NULL; 6733 6734 while (pos < size) { 6735 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6736 6737 /* can we encode this? */ 6738 if (ch < limit) { 6739 /* no overflow check, because we know that the space is enough */ 6740 *str++ = (char)ch; 6741 ++pos; 6742 } 6743 else { 6744 Py_ssize_t newpos, i; 6745 /* startpos for collecting unencodable chars */ 6746 Py_ssize_t collstart = pos; 6747 Py_ssize_t collend = collstart + 1; 6748 /* find all unecodable characters */ 6749 6750 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit)) 6751 ++collend; 6752 6753 /* Only overallocate the buffer if it's not the last write */ 6754 writer.overallocate = (collend < size); 6755 6756 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6757 if (error_handler == _Py_ERROR_UNKNOWN) 6758 error_handler = get_error_handler(errors); 6759 6760 switch (error_handler) { 6761 case _Py_ERROR_STRICT: 6762 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6763 goto onError; 6764 6765 case _Py_ERROR_REPLACE: 6766 memset(str, '?', collend - collstart); 6767 str += (collend - collstart); 6768 /* fall through ignore error handler */ 6769 case _Py_ERROR_IGNORE: 6770 pos = collend; 6771 break; 6772 6773 case _Py_ERROR_BACKSLASHREPLACE: 6774 /* subtract preallocated bytes */ 6775 writer.min_size -= (collend - collstart); 6776 str = backslashreplace(&writer, str, 6777 unicode, collstart, collend); 6778 if (str == NULL) 6779 goto onError; 6780 pos = collend; 6781 break; 6782 6783 case _Py_ERROR_XMLCHARREFREPLACE: 6784 /* subtract preallocated bytes */ 6785 writer.min_size -= (collend - collstart); 6786 str = xmlcharrefreplace(&writer, str, 6787 unicode, collstart, collend); 6788 if (str == NULL) 6789 goto onError; 6790 pos = collend; 6791 break; 6792 6793 case _Py_ERROR_SURROGATEESCAPE: 6794 for (i = collstart; i < collend; ++i) { 6795 ch = PyUnicode_READ(kind, data, i); 6796 if (ch < 0xdc80 || 0xdcff < ch) { 6797 /* Not a UTF-8b surrogate */ 6798 break; 6799 } 6800 *str++ = (char)(ch - 0xdc00); 6801 ++pos; 6802 } 6803 if (i >= collend) 6804 break; 6805 collstart = pos; 6806 assert(collstart != collend); 6807 /* fallback to general error handling */ 6808 6809 default: 6810 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj, 6811 encoding, reason, unicode, &exc, 6812 collstart, collend, &newpos); 6813 if (rep == NULL) 6814 goto onError; 6815 6816 /* subtract preallocated bytes */ 6817 writer.min_size -= 1; 6818 6819 if (PyBytes_Check(rep)) { 6820 /* Directly copy bytes result to output. 
*/ 6821 str = _PyBytesWriter_WriteBytes(&writer, str, 6822 PyBytes_AS_STRING(rep), 6823 PyBytes_GET_SIZE(rep)); 6824 if (str == NULL) 6825 goto onError; 6826 } 6827 else { 6828 assert(PyUnicode_Check(rep)); 6829 6830 if (PyUnicode_READY(rep) < 0) 6831 goto onError; 6832 6833 if (PyUnicode_IS_ASCII(rep)) { 6834 /* Fast path: all characters are smaller than limit */ 6835 assert(limit >= 128); 6836 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 6837 str = _PyBytesWriter_WriteBytes(&writer, str, 6838 PyUnicode_DATA(rep), 6839 PyUnicode_GET_LENGTH(rep)); 6840 } 6841 else { 6842 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep); 6843 6844 str = _PyBytesWriter_Prepare(&writer, str, repsize); 6845 if (str == NULL) 6846 goto onError; 6847 6848 /* check if there is anything unencodable in the 6849 replacement and copy it to the output */ 6850 for (i = 0; repsize-->0; ++i, ++str) { 6851 ch = PyUnicode_READ_CHAR(rep, i); 6852 if (ch >= limit) { 6853 raise_encode_exception(&exc, encoding, unicode, 6854 pos, pos+1, reason); 6855 goto onError; 6856 } 6857 *str = (char)ch; 6858 } 6859 } 6860 } 6861 pos = newpos; 6862 Py_CLEAR(rep); 6863 } 6864 6865 /* If overallocation was disabled, ensure that it was the last 6866 write. 
Otherwise, we missed an optimization */ 6867 assert(writer.overallocate || pos == size); 6868 } 6869 } 6870 6871 Py_XDECREF(error_handler_obj); 6872 Py_XDECREF(exc); 6873 return _PyBytesWriter_Finish(&writer, str); 6874 6875 onError: 6876 Py_XDECREF(rep); 6877 _PyBytesWriter_Dealloc(&writer); 6878 Py_XDECREF(error_handler_obj); 6879 Py_XDECREF(exc); 6880 return NULL; 6881 } 6882 6883 /* Deprecated */ 6884 PyObject * 6885 PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6886 Py_ssize_t size, 6887 const char *errors) 6888 { 6889 PyObject *result; 6890 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6891 if (unicode == NULL) 6892 return NULL; 6893 result = unicode_encode_ucs1(unicode, errors, 256); 6894 Py_DECREF(unicode); 6895 return result; 6896 } 6897 6898 PyObject * 6899 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6900 { 6901 if (!PyUnicode_Check(unicode)) { 6902 PyErr_BadArgument(); 6903 return NULL; 6904 } 6905 if (PyUnicode_READY(unicode) == -1) 6906 return NULL; 6907 /* Fast path: if it is a one-byte string, construct 6908 bytes object directly. */ 6909 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6910 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6911 PyUnicode_GET_LENGTH(unicode)); 6912 /* Non-Latin-1 characters present. Defer to above function to 6913 raise the exception. 
*/ 6914 return unicode_encode_ucs1(unicode, errors, 256); 6915 } 6916 6917 PyObject* 6918 PyUnicode_AsLatin1String(PyObject *unicode) 6919 { 6920 return _PyUnicode_AsLatin1String(unicode, NULL); 6921 } 6922 6923 /* --- 7-bit ASCII Codec -------------------------------------------------- */ 6924 6925 PyObject * 6926 PyUnicode_DecodeASCII(const char *s, 6927 Py_ssize_t size, 6928 const char *errors) 6929 { 6930 const char *starts = s; 6931 _PyUnicodeWriter writer; 6932 int kind; 6933 void *data; 6934 Py_ssize_t startinpos; 6935 Py_ssize_t endinpos; 6936 Py_ssize_t outpos; 6937 const char *e; 6938 PyObject *error_handler_obj = NULL; 6939 PyObject *exc = NULL; 6940 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 6941 6942 if (size == 0) 6943 _Py_RETURN_UNICODE_EMPTY(); 6944 6945 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 6946 if (size == 1 && (unsigned char)s[0] < 128) 6947 return get_latin1_char((unsigned char)s[0]); 6948 6949 _PyUnicodeWriter_Init(&writer); 6950 writer.min_length = size; 6951 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) 6952 return NULL; 6953 6954 e = s + size; 6955 data = writer.data; 6956 outpos = ascii_decode(s, e, (Py_UCS1 *)data); 6957 writer.pos = outpos; 6958 if (writer.pos == size) 6959 return _PyUnicodeWriter_Finish(&writer); 6960 6961 s += writer.pos; 6962 kind = writer.kind; 6963 while (s < e) { 6964 unsigned char c = (unsigned char)*s; 6965 if (c < 128) { 6966 PyUnicode_WRITE(kind, data, writer.pos, c); 6967 writer.pos++; 6968 ++s; 6969 continue; 6970 } 6971 6972 /* byte outsize range 0x00..0x7f: call the error handler */ 6973 6974 if (error_handler == _Py_ERROR_UNKNOWN) 6975 error_handler = get_error_handler(errors); 6976 6977 switch (error_handler) 6978 { 6979 case _Py_ERROR_REPLACE: 6980 case _Py_ERROR_SURROGATEESCAPE: 6981 /* Fast-path: the error handler only writes one character, 6982 but we may switch to UCS2 at the first write */ 6983 if (_PyUnicodeWriter_PrepareKind(&writer, 
PyUnicode_2BYTE_KIND) < 0) 6984 goto onError; 6985 kind = writer.kind; 6986 data = writer.data; 6987 6988 if (error_handler == _Py_ERROR_REPLACE) 6989 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd); 6990 else 6991 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00); 6992 writer.pos++; 6993 ++s; 6994 break; 6995 6996 case _Py_ERROR_IGNORE: 6997 ++s; 6998 break; 6999 7000 default: 7001 startinpos = s-starts; 7002 endinpos = startinpos + 1; 7003 if (unicode_decode_call_errorhandler_writer( 7004 errors, &error_handler_obj, 7005 "ascii", "ordinal not in range(128)", 7006 &starts, &e, &startinpos, &endinpos, &exc, &s, 7007 &writer)) 7008 goto onError; 7009 kind = writer.kind; 7010 data = writer.data; 7011 } 7012 } 7013 Py_XDECREF(error_handler_obj); 7014 Py_XDECREF(exc); 7015 return _PyUnicodeWriter_Finish(&writer); 7016 7017 onError: 7018 _PyUnicodeWriter_Dealloc(&writer); 7019 Py_XDECREF(error_handler_obj); 7020 Py_XDECREF(exc); 7021 return NULL; 7022 } 7023 7024 /* Deprecated */ 7025 PyObject * 7026 PyUnicode_EncodeASCII(const Py_UNICODE *p, 7027 Py_ssize_t size, 7028 const char *errors) 7029 { 7030 PyObject *result; 7031 PyObject *unicode = PyUnicode_FromUnicode(p, size); 7032 if (unicode == NULL) 7033 return NULL; 7034 result = unicode_encode_ucs1(unicode, errors, 128); 7035 Py_DECREF(unicode); 7036 return result; 7037 } 7038 7039 PyObject * 7040 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 7041 { 7042 if (!PyUnicode_Check(unicode)) { 7043 PyErr_BadArgument(); 7044 return NULL; 7045 } 7046 if (PyUnicode_READY(unicode) == -1) 7047 return NULL; 7048 /* Fast path: if it is an ASCII-only string, construct bytes object 7049 directly. Else defer to above function to raise the exception. 
*/ 7050 if (PyUnicode_IS_ASCII(unicode)) 7051 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 7052 PyUnicode_GET_LENGTH(unicode)); 7053 return unicode_encode_ucs1(unicode, errors, 128); 7054 } 7055 7056 PyObject * 7057 PyUnicode_AsASCIIString(PyObject *unicode) 7058 { 7059 return _PyUnicode_AsASCIIString(unicode, NULL); 7060 } 7061 7062 #ifdef MS_WINDOWS 7063 7064 /* --- MBCS codecs for Windows -------------------------------------------- */ 7065 7066 #if SIZEOF_INT < SIZEOF_SIZE_T 7067 #define NEED_RETRY 7068 #endif 7069 7070 #ifndef WC_ERR_INVALID_CHARS 7071 # define WC_ERR_INVALID_CHARS 0x0080 7072 #endif 7073 7074 static const char* 7075 code_page_name(UINT code_page, PyObject **obj) 7076 { 7077 *obj = NULL; 7078 if (code_page == CP_ACP) 7079 return "mbcs"; 7080 if (code_page == CP_UTF7) 7081 return "CP_UTF7"; 7082 if (code_page == CP_UTF8) 7083 return "CP_UTF8"; 7084 7085 *obj = PyBytes_FromFormat("cp%u", code_page); 7086 if (*obj == NULL) 7087 return NULL; 7088 return PyBytes_AS_STRING(*obj); 7089 } 7090 7091 static DWORD 7092 decode_code_page_flags(UINT code_page) 7093 { 7094 if (code_page == CP_UTF7) { 7095 /* The CP_UTF7 decoder only supports flags=0 */ 7096 return 0; 7097 } 7098 else 7099 return MB_ERR_INVALID_CHARS; 7100 } 7101 7102 /* 7103 * Decode a byte string from a Windows code page into unicode object in strict 7104 * mode. 7105 * 7106 * Returns consumed size if succeed, returns -2 on decode error, or raise an 7107 * OSError and returns -1 on other error. 
7108 */ 7109 static int 7110 decode_code_page_strict(UINT code_page, 7111 PyObject **v, 7112 const char *in, 7113 int insize) 7114 { 7115 const DWORD flags = decode_code_page_flags(code_page); 7116 wchar_t *out; 7117 DWORD outsize; 7118 7119 /* First get the size of the result */ 7120 assert(insize > 0); 7121 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 7122 if (outsize <= 0) 7123 goto error; 7124 7125 if (*v == NULL) { 7126 /* Create unicode object */ 7127 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 7128 *v = (PyObject*)_PyUnicode_New(outsize); 7129 if (*v == NULL) 7130 return -1; 7131 out = PyUnicode_AS_UNICODE(*v); 7132 } 7133 else { 7134 /* Extend unicode object */ 7135 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 7136 if (unicode_resize(v, n + outsize) < 0) 7137 return -1; 7138 out = PyUnicode_AS_UNICODE(*v) + n; 7139 } 7140 7141 /* Do the conversion */ 7142 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 7143 if (outsize <= 0) 7144 goto error; 7145 return insize; 7146 7147 error: 7148 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7149 return -2; 7150 PyErr_SetFromWindowsErr(0); 7151 return -1; 7152 } 7153 7154 /* 7155 * Decode a byte string from a code page into unicode object with an error 7156 * handler. 7157 * 7158 * Returns consumed size if succeed, or raise an OSError or 7159 * UnicodeDecodeError exception and returns -1 on error. 7160 */ 7161 static int 7162 decode_code_page_errors(UINT code_page, 7163 PyObject **v, 7164 const char *in, const int size, 7165 const char *errors, int final) 7166 { 7167 const char *startin = in; 7168 const char *endin = in + size; 7169 const DWORD flags = decode_code_page_flags(code_page); 7170 /* Ideally, we should get reason from FormatMessage. This is the Windows 7171 2000 English version of the message. 
*/ 7172 const char *reason = "No mapping for the Unicode character exists " 7173 "in the target code page."; 7174 /* each step cannot decode more than 1 character, but a character can be 7175 represented as a surrogate pair */ 7176 wchar_t buffer[2], *startout, *out; 7177 int insize; 7178 Py_ssize_t outsize; 7179 PyObject *errorHandler = NULL; 7180 PyObject *exc = NULL; 7181 PyObject *encoding_obj = NULL; 7182 const char *encoding; 7183 DWORD err; 7184 int ret = -1; 7185 7186 assert(size > 0); 7187 7188 encoding = code_page_name(code_page, &encoding_obj); 7189 if (encoding == NULL) 7190 return -1; 7191 7192 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) { 7193 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 7194 UnicodeDecodeError. */ 7195 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 7196 if (exc != NULL) { 7197 PyCodec_StrictErrors(exc); 7198 Py_CLEAR(exc); 7199 } 7200 goto error; 7201 } 7202 7203 if (*v == NULL) { 7204 /* Create unicode object */ 7205 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 7206 PyErr_NoMemory(); 7207 goto error; 7208 } 7209 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 7210 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 7211 if (*v == NULL) 7212 goto error; 7213 startout = PyUnicode_AS_UNICODE(*v); 7214 } 7215 else { 7216 /* Extend unicode object */ 7217 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 7218 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 7219 PyErr_NoMemory(); 7220 goto error; 7221 } 7222 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 7223 goto error; 7224 startout = PyUnicode_AS_UNICODE(*v) + n; 7225 } 7226 7227 /* Decode the byte string character per character */ 7228 out = startout; 7229 while (in < endin) 7230 { 7231 /* Decode a character */ 7232 insize = 1; 7233 do 7234 { 7235 outsize = MultiByteToWideChar(code_page, flags, 7236 in, insize, 7237 buffer, 
Py_ARRAY_LENGTH(buffer)); 7238 if (outsize > 0) 7239 break; 7240 err = GetLastError(); 7241 if (err != ERROR_NO_UNICODE_TRANSLATION 7242 && err != ERROR_INSUFFICIENT_BUFFER) 7243 { 7244 PyErr_SetFromWindowsErr(0); 7245 goto error; 7246 } 7247 insize++; 7248 } 7249 /* 4=maximum length of a UTF-8 sequence */ 7250 while (insize <= 4 && (in + insize) <= endin); 7251 7252 if (outsize <= 0) { 7253 Py_ssize_t startinpos, endinpos, outpos; 7254 7255 /* last character in partial decode? */ 7256 if (in + insize >= endin && !final) 7257 break; 7258 7259 startinpos = in - startin; 7260 endinpos = startinpos + 1; 7261 outpos = out - PyUnicode_AS_UNICODE(*v); 7262 if (unicode_decode_call_errorhandler_wchar( 7263 errors, &errorHandler, 7264 encoding, reason, 7265 &startin, &endin, &startinpos, &endinpos, &exc, &in, 7266 v, &outpos)) 7267 { 7268 goto error; 7269 } 7270 out = PyUnicode_AS_UNICODE(*v) + outpos; 7271 } 7272 else { 7273 in += insize; 7274 memcpy(out, buffer, outsize * sizeof(wchar_t)); 7275 out += outsize; 7276 } 7277 } 7278 7279 /* write a NUL character at the end */ 7280 *out = 0; 7281 7282 /* Extend unicode object */ 7283 outsize = out - startout; 7284 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 7285 if (unicode_resize(v, outsize) < 0) 7286 goto error; 7287 /* (in - startin) <= size and size is an int */ 7288 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int); 7289 7290 error: 7291 Py_XDECREF(encoding_obj); 7292 Py_XDECREF(errorHandler); 7293 Py_XDECREF(exc); 7294 return ret; 7295 } 7296 7297 static PyObject * 7298 decode_code_page_stateful(int code_page, 7299 const char *s, Py_ssize_t size, 7300 const char *errors, Py_ssize_t *consumed) 7301 { 7302 PyObject *v = NULL; 7303 int chunk_size, final, converted, done; 7304 7305 if (code_page < 0) { 7306 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7307 return NULL; 7308 } 7309 7310 if (consumed) 7311 *consumed = 0; 7312 7313 do 7314 { 7315 #ifdef NEED_RETRY 7316 if (size > INT_MAX) { 7317 
chunk_size = INT_MAX; 7318 final = 0; 7319 done = 0; 7320 } 7321 else 7322 #endif 7323 { 7324 chunk_size = (int)size; 7325 final = (consumed == NULL); 7326 done = 1; 7327 } 7328 7329 if (chunk_size == 0 && done) { 7330 if (v != NULL) 7331 break; 7332 _Py_RETURN_UNICODE_EMPTY(); 7333 } 7334 7335 converted = decode_code_page_strict(code_page, &v, 7336 s, chunk_size); 7337 if (converted == -2) 7338 converted = decode_code_page_errors(code_page, &v, 7339 s, chunk_size, 7340 errors, final); 7341 assert(converted != 0 || done); 7342 7343 if (converted < 0) { 7344 Py_XDECREF(v); 7345 return NULL; 7346 } 7347 7348 if (consumed) 7349 *consumed += converted; 7350 7351 s += converted; 7352 size -= converted; 7353 } while (!done); 7354 7355 return unicode_result(v); 7356 } 7357 7358 PyObject * 7359 PyUnicode_DecodeCodePageStateful(int code_page, 7360 const char *s, 7361 Py_ssize_t size, 7362 const char *errors, 7363 Py_ssize_t *consumed) 7364 { 7365 return decode_code_page_stateful(code_page, s, size, errors, consumed); 7366 } 7367 7368 PyObject * 7369 PyUnicode_DecodeMBCSStateful(const char *s, 7370 Py_ssize_t size, 7371 const char *errors, 7372 Py_ssize_t *consumed) 7373 { 7374 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 7375 } 7376 7377 PyObject * 7378 PyUnicode_DecodeMBCS(const char *s, 7379 Py_ssize_t size, 7380 const char *errors) 7381 { 7382 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 7383 } 7384 7385 static DWORD 7386 encode_code_page_flags(UINT code_page, const char *errors) 7387 { 7388 if (code_page == CP_UTF8) { 7389 return WC_ERR_INVALID_CHARS; 7390 } 7391 else if (code_page == CP_UTF7) { 7392 /* CP_UTF7 only supports flags=0 */ 7393 return 0; 7394 } 7395 else { 7396 if (errors != NULL && strcmp(errors, "replace") == 0) 7397 return 0; 7398 else 7399 return WC_NO_BEST_FIT_CHARS; 7400 } 7401 } 7402 7403 /* 7404 * Encode a Unicode string to a Windows code page into a byte string in strict 7405 * mode. 
7406 * 7407 * Returns consumed characters if succeed, returns -2 on encode error, or raise 7408 * an OSError and returns -1 on other error. 7409 */ 7410 static int 7411 encode_code_page_strict(UINT code_page, PyObject **outbytes, 7412 PyObject *unicode, Py_ssize_t offset, int len, 7413 const char* errors) 7414 { 7415 BOOL usedDefaultChar = FALSE; 7416 BOOL *pusedDefaultChar = &usedDefaultChar; 7417 int outsize; 7418 wchar_t *p; 7419 Py_ssize_t size; 7420 const DWORD flags = encode_code_page_flags(code_page, NULL); 7421 char *out; 7422 /* Create a substring so that we can get the UTF-16 representation 7423 of just the slice under consideration. */ 7424 PyObject *substring; 7425 7426 assert(len > 0); 7427 7428 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7429 pusedDefaultChar = &usedDefaultChar; 7430 else 7431 pusedDefaultChar = NULL; 7432 7433 substring = PyUnicode_Substring(unicode, offset, offset+len); 7434 if (substring == NULL) 7435 return -1; 7436 p = PyUnicode_AsUnicodeAndSize(substring, &size); 7437 if (p == NULL) { 7438 Py_DECREF(substring); 7439 return -1; 7440 } 7441 assert(size <= INT_MAX); 7442 7443 /* First get the size of the result */ 7444 outsize = WideCharToMultiByte(code_page, flags, 7445 p, (int)size, 7446 NULL, 0, 7447 NULL, pusedDefaultChar); 7448 if (outsize <= 0) 7449 goto error; 7450 /* If we used a default char, then we failed! 
*/ 7451 if (pusedDefaultChar && *pusedDefaultChar) { 7452 Py_DECREF(substring); 7453 return -2; 7454 } 7455 7456 if (*outbytes == NULL) { 7457 /* Create string object */ 7458 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7459 if (*outbytes == NULL) { 7460 Py_DECREF(substring); 7461 return -1; 7462 } 7463 out = PyBytes_AS_STRING(*outbytes); 7464 } 7465 else { 7466 /* Extend string object */ 7467 const Py_ssize_t n = PyBytes_Size(*outbytes); 7468 if (outsize > PY_SSIZE_T_MAX - n) { 7469 PyErr_NoMemory(); 7470 Py_DECREF(substring); 7471 return -1; 7472 } 7473 if (_PyBytes_Resize(outbytes, n + outsize) < 0) { 7474 Py_DECREF(substring); 7475 return -1; 7476 } 7477 out = PyBytes_AS_STRING(*outbytes) + n; 7478 } 7479 7480 /* Do the conversion */ 7481 outsize = WideCharToMultiByte(code_page, flags, 7482 p, (int)size, 7483 out, outsize, 7484 NULL, pusedDefaultChar); 7485 Py_CLEAR(substring); 7486 if (outsize <= 0) 7487 goto error; 7488 if (pusedDefaultChar && *pusedDefaultChar) 7489 return -2; 7490 return 0; 7491 7492 error: 7493 Py_XDECREF(substring); 7494 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7495 return -2; 7496 PyErr_SetFromWindowsErr(0); 7497 return -1; 7498 } 7499 7500 /* 7501 * Encode a Unicode string to a Windows code page into a byte string using an 7502 * error handler. 7503 * 7504 * Returns consumed characters if succeed, or raise an OSError and returns 7505 * -1 on other error. 7506 */ 7507 static int 7508 encode_code_page_errors(UINT code_page, PyObject **outbytes, 7509 PyObject *unicode, Py_ssize_t unicode_offset, 7510 Py_ssize_t insize, const char* errors) 7511 { 7512 const DWORD flags = encode_code_page_flags(code_page, errors); 7513 Py_ssize_t pos = unicode_offset; 7514 Py_ssize_t endin = unicode_offset + insize; 7515 /* Ideally, we should get reason from FormatMessage. This is the Windows 7516 2000 English version of the message. 
*/ 7517 const char *reason = "invalid character"; 7518 /* 4=maximum length of a UTF-8 sequence */ 7519 char buffer[4]; 7520 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 7521 Py_ssize_t outsize; 7522 char *out; 7523 PyObject *errorHandler = NULL; 7524 PyObject *exc = NULL; 7525 PyObject *encoding_obj = NULL; 7526 const char *encoding; 7527 Py_ssize_t newpos, newoutsize; 7528 PyObject *rep; 7529 int ret = -1; 7530 7531 assert(insize > 0); 7532 7533 encoding = code_page_name(code_page, &encoding_obj); 7534 if (encoding == NULL) 7535 return -1; 7536 7537 if (errors == NULL || strcmp(errors, "strict") == 0) { 7538 /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7539 then we raise a UnicodeEncodeError. */ 7540 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7541 if (exc != NULL) { 7542 PyCodec_StrictErrors(exc); 7543 Py_DECREF(exc); 7544 } 7545 Py_XDECREF(encoding_obj); 7546 return -1; 7547 } 7548 7549 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7550 pusedDefaultChar = &usedDefaultChar; 7551 else 7552 pusedDefaultChar = NULL; 7553 7554 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7555 PyErr_NoMemory(); 7556 goto error; 7557 } 7558 outsize = insize * Py_ARRAY_LENGTH(buffer); 7559 7560 if (*outbytes == NULL) { 7561 /* Create string object */ 7562 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7563 if (*outbytes == NULL) 7564 goto error; 7565 out = PyBytes_AS_STRING(*outbytes); 7566 } 7567 else { 7568 /* Extend string object */ 7569 Py_ssize_t n = PyBytes_Size(*outbytes); 7570 if (n > PY_SSIZE_T_MAX - outsize) { 7571 PyErr_NoMemory(); 7572 goto error; 7573 } 7574 if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7575 goto error; 7576 out = PyBytes_AS_STRING(*outbytes) + n; 7577 } 7578 7579 /* Encode the string character per character */ 7580 while (pos < endin) 7581 { 7582 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7583 wchar_t chars[2]; 7584 int charsize; 7585 if (ch < 0x10000) { 7586 chars[0] = (wchar_t)ch; 7587 charsize = 
1; 7588 } 7589 else { 7590 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch); 7591 chars[1] = Py_UNICODE_LOW_SURROGATE(ch); 7592 charsize = 2; 7593 } 7594 7595 outsize = WideCharToMultiByte(code_page, flags, 7596 chars, charsize, 7597 buffer, Py_ARRAY_LENGTH(buffer), 7598 NULL, pusedDefaultChar); 7599 if (outsize > 0) { 7600 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7601 { 7602 pos++; 7603 memcpy(out, buffer, outsize); 7604 out += outsize; 7605 continue; 7606 } 7607 } 7608 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7609 PyErr_SetFromWindowsErr(0); 7610 goto error; 7611 } 7612 7613 rep = unicode_encode_call_errorhandler( 7614 errors, &errorHandler, encoding, reason, 7615 unicode, &exc, 7616 pos, pos + 1, &newpos); 7617 if (rep == NULL) 7618 goto error; 7619 pos = newpos; 7620 7621 if (PyBytes_Check(rep)) { 7622 outsize = PyBytes_GET_SIZE(rep); 7623 if (outsize != 1) { 7624 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7625 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7626 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7627 Py_DECREF(rep); 7628 goto error; 7629 } 7630 out = PyBytes_AS_STRING(*outbytes) + offset; 7631 } 7632 memcpy(out, PyBytes_AS_STRING(rep), outsize); 7633 out += outsize; 7634 } 7635 else { 7636 Py_ssize_t i; 7637 enum PyUnicode_Kind kind; 7638 void *data; 7639 7640 if (PyUnicode_READY(rep) == -1) { 7641 Py_DECREF(rep); 7642 goto error; 7643 } 7644 7645 outsize = PyUnicode_GET_LENGTH(rep); 7646 if (outsize != 1) { 7647 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7648 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7649 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7650 Py_DECREF(rep); 7651 goto error; 7652 } 7653 out = PyBytes_AS_STRING(*outbytes) + offset; 7654 } 7655 kind = PyUnicode_KIND(rep); 7656 data = PyUnicode_DATA(rep); 7657 for (i=0; i < outsize; i++) { 7658 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7659 if (ch > 127) { 7660 raise_encode_exception(&exc, 7661 encoding, 
unicode, 7662 pos, pos + 1, 7663 "unable to encode error handler result to ASCII"); 7664 Py_DECREF(rep); 7665 goto error; 7666 } 7667 *out = (unsigned char)ch; 7668 out++; 7669 } 7670 } 7671 Py_DECREF(rep); 7672 } 7673 /* write a NUL byte */ 7674 *out = 0; 7675 outsize = out - PyBytes_AS_STRING(*outbytes); 7676 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 7677 if (_PyBytes_Resize(outbytes, outsize) < 0) 7678 goto error; 7679 ret = 0; 7680 7681 error: 7682 Py_XDECREF(encoding_obj); 7683 Py_XDECREF(errorHandler); 7684 Py_XDECREF(exc); 7685 return ret; 7686 } 7687 7688 static PyObject * 7689 encode_code_page(int code_page, 7690 PyObject *unicode, 7691 const char *errors) 7692 { 7693 Py_ssize_t len; 7694 PyObject *outbytes = NULL; 7695 Py_ssize_t offset; 7696 int chunk_len, ret, done; 7697 7698 if (!PyUnicode_Check(unicode)) { 7699 PyErr_BadArgument(); 7700 return NULL; 7701 } 7702 7703 if (PyUnicode_READY(unicode) == -1) 7704 return NULL; 7705 len = PyUnicode_GET_LENGTH(unicode); 7706 7707 if (code_page < 0) { 7708 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7709 return NULL; 7710 } 7711 7712 if (len == 0) 7713 return PyBytes_FromStringAndSize(NULL, 0); 7714 7715 offset = 0; 7716 do 7717 { 7718 #ifdef NEED_RETRY 7719 /* UTF-16 encoding may double the size, so use only INT_MAX/2 7720 chunks. 
*/ 7721 if (len > INT_MAX/2) { 7722 chunk_len = INT_MAX/2; 7723 done = 0; 7724 } 7725 else 7726 #endif 7727 { 7728 chunk_len = (int)len; 7729 done = 1; 7730 } 7731 7732 ret = encode_code_page_strict(code_page, &outbytes, 7733 unicode, offset, chunk_len, 7734 errors); 7735 if (ret == -2) 7736 ret = encode_code_page_errors(code_page, &outbytes, 7737 unicode, offset, 7738 chunk_len, errors); 7739 if (ret < 0) { 7740 Py_XDECREF(outbytes); 7741 return NULL; 7742 } 7743 7744 offset += chunk_len; 7745 len -= chunk_len; 7746 } while (!done); 7747 7748 return outbytes; 7749 } 7750 7751 PyObject * 7752 PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7753 Py_ssize_t size, 7754 const char *errors) 7755 { 7756 PyObject *unicode, *res; 7757 unicode = PyUnicode_FromUnicode(p, size); 7758 if (unicode == NULL) 7759 return NULL; 7760 res = encode_code_page(CP_ACP, unicode, errors); 7761 Py_DECREF(unicode); 7762 return res; 7763 } 7764 7765 PyObject * 7766 PyUnicode_EncodeCodePage(int code_page, 7767 PyObject *unicode, 7768 const char *errors) 7769 { 7770 return encode_code_page(code_page, unicode, errors); 7771 } 7772 7773 PyObject * 7774 PyUnicode_AsMBCSString(PyObject *unicode) 7775 { 7776 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 7777 } 7778 7779 #undef NEED_RETRY 7780 7781 #endif /* MS_WINDOWS */ 7782 7783 /* --- Character Mapping Codec -------------------------------------------- */ 7784 7785 static int 7786 charmap_decode_string(const char *s, 7787 Py_ssize_t size, 7788 PyObject *mapping, 7789 const char *errors, 7790 _PyUnicodeWriter *writer) 7791 { 7792 const char *starts = s; 7793 const char *e; 7794 Py_ssize_t startinpos, endinpos; 7795 PyObject *errorHandler = NULL, *exc = NULL; 7796 Py_ssize_t maplen; 7797 enum PyUnicode_Kind mapkind; 7798 void *mapdata; 7799 Py_UCS4 x; 7800 unsigned char ch; 7801 7802 if (PyUnicode_READY(mapping) == -1) 7803 return -1; 7804 7805 maplen = PyUnicode_GET_LENGTH(mapping); 7806 mapdata = PyUnicode_DATA(mapping); 7807 mapkind = 
PyUnicode_KIND(mapping); 7808 7809 e = s + size; 7810 7811 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) { 7812 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1 7813 * is disabled in encoding aliases, latin1 is preferred because 7814 * its implementation is faster. */ 7815 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata; 7816 Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 7817 Py_UCS4 maxchar = writer->maxchar; 7818 7819 assert (writer->kind == PyUnicode_1BYTE_KIND); 7820 while (s < e) { 7821 ch = *s; 7822 x = mapdata_ucs1[ch]; 7823 if (x > maxchar) { 7824 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1) 7825 goto onError; 7826 maxchar = writer->maxchar; 7827 outdata = (Py_UCS1 *)writer->data; 7828 } 7829 outdata[writer->pos] = x; 7830 writer->pos++; 7831 ++s; 7832 } 7833 return 0; 7834 } 7835 7836 while (s < e) { 7837 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { 7838 enum PyUnicode_Kind outkind = writer->kind; 7839 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata; 7840 if (outkind == PyUnicode_1BYTE_KIND) { 7841 Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 7842 Py_UCS4 maxchar = writer->maxchar; 7843 while (s < e) { 7844 ch = *s; 7845 x = mapdata_ucs2[ch]; 7846 if (x > maxchar) 7847 goto Error; 7848 outdata[writer->pos] = x; 7849 writer->pos++; 7850 ++s; 7851 } 7852 break; 7853 } 7854 else if (outkind == PyUnicode_2BYTE_KIND) { 7855 Py_UCS2 *outdata = (Py_UCS2 *)writer->data; 7856 while (s < e) { 7857 ch = *s; 7858 x = mapdata_ucs2[ch]; 7859 if (x == 0xFFFE) 7860 goto Error; 7861 outdata[writer->pos] = x; 7862 writer->pos++; 7863 ++s; 7864 } 7865 break; 7866 } 7867 } 7868 ch = *s; 7869 7870 if (ch < maplen) 7871 x = PyUnicode_READ(mapkind, mapdata, ch); 7872 else 7873 x = 0xfffe; /* invalid value */ 7874 Error: 7875 if (x == 0xfffe) 7876 { 7877 /* undefined mapping */ 7878 startinpos = s-starts; 7879 endinpos = startinpos+1; 7880 if (unicode_decode_call_errorhandler_writer( 7881 errors, &errorHandler, 7882 "charmap", "character maps to 
<undefined>", 7883 &starts, &e, &startinpos, &endinpos, &exc, &s, 7884 writer)) { 7885 goto onError; 7886 } 7887 continue; 7888 } 7889 7890 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0) 7891 goto onError; 7892 ++s; 7893 } 7894 Py_XDECREF(errorHandler); 7895 Py_XDECREF(exc); 7896 return 0; 7897 7898 onError: 7899 Py_XDECREF(errorHandler); 7900 Py_XDECREF(exc); 7901 return -1; 7902 } 7903 7904 static int 7905 charmap_decode_mapping(const char *s, 7906 Py_ssize_t size, 7907 PyObject *mapping, 7908 const char *errors, 7909 _PyUnicodeWriter *writer) 7910 { 7911 const char *starts = s; 7912 const char *e; 7913 Py_ssize_t startinpos, endinpos; 7914 PyObject *errorHandler = NULL, *exc = NULL; 7915 unsigned char ch; 7916 PyObject *key, *item = NULL; 7917 7918 e = s + size; 7919 7920 while (s < e) { 7921 ch = *s; 7922 7923 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7924 key = PyLong_FromLong((long)ch); 7925 if (key == NULL) 7926 goto onError; 7927 7928 item = PyObject_GetItem(mapping, key); 7929 Py_DECREF(key); 7930 if (item == NULL) { 7931 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7932 /* No mapping found means: mapping is undefined. 
*/ 7933 PyErr_Clear(); 7934 goto Undefined; 7935 } else 7936 goto onError; 7937 } 7938 7939 /* Apply mapping */ 7940 if (item == Py_None) 7941 goto Undefined; 7942 if (PyLong_Check(item)) { 7943 long value = PyLong_AS_LONG(item); 7944 if (value == 0xFFFE) 7945 goto Undefined; 7946 if (value < 0 || value > MAX_UNICODE) { 7947 PyErr_Format(PyExc_TypeError, 7948 "character mapping must be in range(0x%lx)", 7949 (unsigned long)MAX_UNICODE + 1); 7950 goto onError; 7951 } 7952 7953 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7954 goto onError; 7955 } 7956 else if (PyUnicode_Check(item)) { 7957 if (PyUnicode_READY(item) == -1) 7958 goto onError; 7959 if (PyUnicode_GET_LENGTH(item) == 1) { 7960 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0); 7961 if (value == 0xFFFE) 7962 goto Undefined; 7963 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7964 goto onError; 7965 } 7966 else { 7967 writer->overallocate = 1; 7968 if (_PyUnicodeWriter_WriteStr(writer, item) == -1) 7969 goto onError; 7970 } 7971 } 7972 else { 7973 /* wrong return value */ 7974 PyErr_SetString(PyExc_TypeError, 7975 "character mapping must return integer, None or str"); 7976 goto onError; 7977 } 7978 Py_CLEAR(item); 7979 ++s; 7980 continue; 7981 7982 Undefined: 7983 /* undefined mapping */ 7984 Py_CLEAR(item); 7985 startinpos = s-starts; 7986 endinpos = startinpos+1; 7987 if (unicode_decode_call_errorhandler_writer( 7988 errors, &errorHandler, 7989 "charmap", "character maps to <undefined>", 7990 &starts, &e, &startinpos, &endinpos, &exc, &s, 7991 writer)) { 7992 goto onError; 7993 } 7994 } 7995 Py_XDECREF(errorHandler); 7996 Py_XDECREF(exc); 7997 return 0; 7998 7999 onError: 8000 Py_XDECREF(item); 8001 Py_XDECREF(errorHandler); 8002 Py_XDECREF(exc); 8003 return -1; 8004 } 8005 8006 PyObject * 8007 PyUnicode_DecodeCharmap(const char *s, 8008 Py_ssize_t size, 8009 PyObject *mapping, 8010 const char *errors) 8011 { 8012 _PyUnicodeWriter writer; 8013 8014 /* Default to Latin-1 */ 8015 if 
(mapping == NULL) 8016 return PyUnicode_DecodeLatin1(s, size, errors); 8017 8018 if (size == 0) 8019 _Py_RETURN_UNICODE_EMPTY(); 8020 _PyUnicodeWriter_Init(&writer); 8021 writer.min_length = size; 8022 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 8023 goto onError; 8024 8025 if (PyUnicode_CheckExact(mapping)) { 8026 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0) 8027 goto onError; 8028 } 8029 else { 8030 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0) 8031 goto onError; 8032 } 8033 return _PyUnicodeWriter_Finish(&writer); 8034 8035 onError: 8036 _PyUnicodeWriter_Dealloc(&writer); 8037 return NULL; 8038 } 8039 8040 /* Charmap encoding: the lookup table */ 8041 8042 struct encoding_map { 8043 PyObject_HEAD 8044 unsigned char level1[32]; 8045 int count2, count3; 8046 unsigned char level23[1]; 8047 }; 8048 8049 static PyObject* 8050 encoding_map_size(PyObject *obj, PyObject* args) 8051 { 8052 struct encoding_map *map = (struct encoding_map*)obj; 8053 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 8054 128*map->count3); 8055 } 8056 8057 static PyMethodDef encoding_map_methods[] = { 8058 {"size", encoding_map_size, METH_NOARGS, 8059 PyDoc_STR("Return the size (in bytes) of this object") }, 8060 { 0 } 8061 }; 8062 8063 static void 8064 encoding_map_dealloc(PyObject* o) 8065 { 8066 PyObject_FREE(o); 8067 } 8068 8069 static PyTypeObject EncodingMapType = { 8070 PyVarObject_HEAD_INIT(NULL, 0) 8071 "EncodingMap", /*tp_name*/ 8072 sizeof(struct encoding_map), /*tp_basicsize*/ 8073 0, /*tp_itemsize*/ 8074 /* methods */ 8075 encoding_map_dealloc, /*tp_dealloc*/ 8076 0, /*tp_print*/ 8077 0, /*tp_getattr*/ 8078 0, /*tp_setattr*/ 8079 0, /*tp_reserved*/ 8080 0, /*tp_repr*/ 8081 0, /*tp_as_number*/ 8082 0, /*tp_as_sequence*/ 8083 0, /*tp_as_mapping*/ 8084 0, /*tp_hash*/ 8085 0, /*tp_call*/ 8086 0, /*tp_str*/ 8087 0, /*tp_getattro*/ 8088 0, /*tp_setattro*/ 8089 0, /*tp_as_buffer*/ 8090 Py_TPFLAGS_DEFAULT, 
/*tp_flags*/ 8091 0, /*tp_doc*/ 8092 0, /*tp_traverse*/ 8093 0, /*tp_clear*/ 8094 0, /*tp_richcompare*/ 8095 0, /*tp_weaklistoffset*/ 8096 0, /*tp_iter*/ 8097 0, /*tp_iternext*/ 8098 encoding_map_methods, /*tp_methods*/ 8099 0, /*tp_members*/ 8100 0, /*tp_getset*/ 8101 0, /*tp_base*/ 8102 0, /*tp_dict*/ 8103 0, /*tp_descr_get*/ 8104 0, /*tp_descr_set*/ 8105 0, /*tp_dictoffset*/ 8106 0, /*tp_init*/ 8107 0, /*tp_alloc*/ 8108 0, /*tp_new*/ 8109 0, /*tp_free*/ 8110 0, /*tp_is_gc*/ 8111 }; 8112 8113 PyObject* 8114 PyUnicode_BuildEncodingMap(PyObject* string) 8115 { 8116 PyObject *result; 8117 struct encoding_map *mresult; 8118 int i; 8119 int need_dict = 0; 8120 unsigned char level1[32]; 8121 unsigned char level2[512]; 8122 unsigned char *mlevel1, *mlevel2, *mlevel3; 8123 int count2 = 0, count3 = 0; 8124 int kind; 8125 void *data; 8126 Py_ssize_t length; 8127 Py_UCS4 ch; 8128 8129 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) { 8130 PyErr_BadArgument(); 8131 return NULL; 8132 } 8133 kind = PyUnicode_KIND(string); 8134 data = PyUnicode_DATA(string); 8135 length = PyUnicode_GET_LENGTH(string); 8136 length = Py_MIN(length, 256); 8137 memset(level1, 0xFF, sizeof level1); 8138 memset(level2, 0xFF, sizeof level2); 8139 8140 /* If there isn't a one-to-one mapping of NULL to \0, 8141 or if there are non-BMP characters, we need to use 8142 a mapping dictionary. 
*/ 8143 if (PyUnicode_READ(kind, data, 0) != 0) 8144 need_dict = 1; 8145 for (i = 1; i < length; i++) { 8146 int l1, l2; 8147 ch = PyUnicode_READ(kind, data, i); 8148 if (ch == 0 || ch > 0xFFFF) { 8149 need_dict = 1; 8150 break; 8151 } 8152 if (ch == 0xFFFE) 8153 /* unmapped character */ 8154 continue; 8155 l1 = ch >> 11; 8156 l2 = ch >> 7; 8157 if (level1[l1] == 0xFF) 8158 level1[l1] = count2++; 8159 if (level2[l2] == 0xFF) 8160 level2[l2] = count3++; 8161 } 8162 8163 if (count2 >= 0xFF || count3 >= 0xFF) 8164 need_dict = 1; 8165 8166 if (need_dict) { 8167 PyObject *result = PyDict_New(); 8168 PyObject *key, *value; 8169 if (!result) 8170 return NULL; 8171 for (i = 0; i < length; i++) { 8172 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 8173 value = PyLong_FromLong(i); 8174 if (!key || !value) 8175 goto failed1; 8176 if (PyDict_SetItem(result, key, value) == -1) 8177 goto failed1; 8178 Py_DECREF(key); 8179 Py_DECREF(value); 8180 } 8181 return result; 8182 failed1: 8183 Py_XDECREF(key); 8184 Py_XDECREF(value); 8185 Py_DECREF(result); 8186 return NULL; 8187 } 8188 8189 /* Create a three-level trie */ 8190 result = PyObject_MALLOC(sizeof(struct encoding_map) + 8191 16*count2 + 128*count3 - 1); 8192 if (!result) 8193 return PyErr_NoMemory(); 8194 PyObject_Init(result, &EncodingMapType); 8195 mresult = (struct encoding_map*)result; 8196 mresult->count2 = count2; 8197 mresult->count3 = count3; 8198 mlevel1 = mresult->level1; 8199 mlevel2 = mresult->level23; 8200 mlevel3 = mresult->level23 + 16*count2; 8201 memcpy(mlevel1, level1, 32); 8202 memset(mlevel2, 0xFF, 16*count2); 8203 memset(mlevel3, 0, 128*count3); 8204 count3 = 0; 8205 for (i = 1; i < length; i++) { 8206 int o1, o2, o3, i2, i3; 8207 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8208 if (ch == 0xFFFE) 8209 /* unmapped character */ 8210 continue; 8211 o1 = ch>>11; 8212 o2 = (ch>>7) & 0xF; 8213 i2 = 16*mlevel1[o1] + o2; 8214 if (mlevel2[i2] == 0xFF) 8215 mlevel2[i2] = count3++; 8216 o3 = ch & 0x7F; 8217 
i3 = 128*mlevel2[i2] + o3; 8218 mlevel3[i3] = i; 8219 } 8220 return result; 8221 } 8222 8223 static int 8224 encoding_map_lookup(Py_UCS4 c, PyObject *mapping) 8225 { 8226 struct encoding_map *map = (struct encoding_map*)mapping; 8227 int l1 = c>>11; 8228 int l2 = (c>>7) & 0xF; 8229 int l3 = c & 0x7F; 8230 int i; 8231 8232 if (c > 0xFFFF) 8233 return -1; 8234 if (c == 0) 8235 return 0; 8236 /* level 1*/ 8237 i = map->level1[l1]; 8238 if (i == 0xFF) { 8239 return -1; 8240 } 8241 /* level 2*/ 8242 i = map->level23[16*i+l2]; 8243 if (i == 0xFF) { 8244 return -1; 8245 } 8246 /* level 3 */ 8247 i = map->level23[16*map->count2 + 128*i + l3]; 8248 if (i == 0) { 8249 return -1; 8250 } 8251 return i; 8252 } 8253 8254 /* Lookup the character ch in the mapping. If the character 8255 can't be found, Py_None is returned (or NULL, if another 8256 error occurred). */ 8257 static PyObject * 8258 charmapencode_lookup(Py_UCS4 c, PyObject *mapping) 8259 { 8260 PyObject *w = PyLong_FromLong((long)c); 8261 PyObject *x; 8262 8263 if (w == NULL) 8264 return NULL; 8265 x = PyObject_GetItem(mapping, w); 8266 Py_DECREF(w); 8267 if (x == NULL) { 8268 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8269 /* No mapping found means: mapping is undefined. 
*/ 8270 PyErr_Clear(); 8271 x = Py_None; 8272 Py_INCREF(x); 8273 return x; 8274 } else 8275 return NULL; 8276 } 8277 else if (x == Py_None) 8278 return x; 8279 else if (PyLong_Check(x)) { 8280 long value = PyLong_AS_LONG(x); 8281 if (value < 0 || value > 255) { 8282 PyErr_SetString(PyExc_TypeError, 8283 "character mapping must be in range(256)"); 8284 Py_DECREF(x); 8285 return NULL; 8286 } 8287 return x; 8288 } 8289 else if (PyBytes_Check(x)) 8290 return x; 8291 else { 8292 /* wrong return value */ 8293 PyErr_Format(PyExc_TypeError, 8294 "character mapping must return integer, bytes or None, not %.400s", 8295 x->ob_type->tp_name); 8296 Py_DECREF(x); 8297 return NULL; 8298 } 8299 } 8300 8301 static int 8302 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 8303 { 8304 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8305 /* exponentially overallocate to minimize reallocations */ 8306 if (requiredsize < 2*outsize) 8307 requiredsize = 2*outsize; 8308 if (_PyBytes_Resize(outobj, requiredsize)) 8309 return -1; 8310 return 0; 8311 } 8312 8313 typedef enum charmapencode_result { 8314 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 8315 } charmapencode_result; 8316 /* lookup the character, put the result in the output string and adjust 8317 various state variables. Resize the output bytes object if not enough 8318 space is available. Return a new reference to the object that 8319 was put in the output buffer, or Py_None, if the mapping was undefined 8320 (in which case no character was written) or NULL, if a 8321 reallocation error occurred. 
The caller must decref the result */ 8322 static charmapencode_result 8323 charmapencode_output(Py_UCS4 c, PyObject *mapping, 8324 PyObject **outobj, Py_ssize_t *outpos) 8325 { 8326 PyObject *rep; 8327 char *outstart; 8328 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8329 8330 if (Py_TYPE(mapping) == &EncodingMapType) { 8331 int res = encoding_map_lookup(c, mapping); 8332 Py_ssize_t requiredsize = *outpos+1; 8333 if (res == -1) 8334 return enc_FAILED; 8335 if (outsize<requiredsize) 8336 if (charmapencode_resize(outobj, outpos, requiredsize)) 8337 return enc_EXCEPTION; 8338 outstart = PyBytes_AS_STRING(*outobj); 8339 outstart[(*outpos)++] = (char)res; 8340 return enc_SUCCESS; 8341 } 8342 8343 rep = charmapencode_lookup(c, mapping); 8344 if (rep==NULL) 8345 return enc_EXCEPTION; 8346 else if (rep==Py_None) { 8347 Py_DECREF(rep); 8348 return enc_FAILED; 8349 } else { 8350 if (PyLong_Check(rep)) { 8351 Py_ssize_t requiredsize = *outpos+1; 8352 if (outsize<requiredsize) 8353 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8354 Py_DECREF(rep); 8355 return enc_EXCEPTION; 8356 } 8357 outstart = PyBytes_AS_STRING(*outobj); 8358 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 8359 } 8360 else { 8361 const char *repchars = PyBytes_AS_STRING(rep); 8362 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 8363 Py_ssize_t requiredsize = *outpos+repsize; 8364 if (outsize<requiredsize) 8365 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8366 Py_DECREF(rep); 8367 return enc_EXCEPTION; 8368 } 8369 outstart = PyBytes_AS_STRING(*outobj); 8370 memcpy(outstart + *outpos, repchars, repsize); 8371 *outpos += repsize; 8372 } 8373 } 8374 Py_DECREF(rep); 8375 return enc_SUCCESS; 8376 } 8377 8378 /* handle an error in PyUnicode_EncodeCharmap 8379 Return 0 on success, -1 on error */ 8380 static int 8381 charmap_encoding_error( 8382 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, 8383 PyObject **exceptionObject, 8384 _Py_error_handler *error_handler, PyObject 
**error_handler_obj, const char *errors, 8385 PyObject **res, Py_ssize_t *respos) 8386 { 8387 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8388 Py_ssize_t size, repsize; 8389 Py_ssize_t newpos; 8390 enum PyUnicode_Kind kind; 8391 void *data; 8392 Py_ssize_t index; 8393 /* startpos for collecting unencodable chars */ 8394 Py_ssize_t collstartpos = *inpos; 8395 Py_ssize_t collendpos = *inpos+1; 8396 Py_ssize_t collpos; 8397 char *encoding = "charmap"; 8398 char *reason = "character maps to <undefined>"; 8399 charmapencode_result x; 8400 Py_UCS4 ch; 8401 int val; 8402 8403 if (PyUnicode_READY(unicode) == -1) 8404 return -1; 8405 size = PyUnicode_GET_LENGTH(unicode); 8406 /* find all unencodable characters */ 8407 while (collendpos < size) { 8408 PyObject *rep; 8409 if (Py_TYPE(mapping) == &EncodingMapType) { 8410 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8411 val = encoding_map_lookup(ch, mapping); 8412 if (val != -1) 8413 break; 8414 ++collendpos; 8415 continue; 8416 } 8417 8418 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8419 rep = charmapencode_lookup(ch, mapping); 8420 if (rep==NULL) 8421 return -1; 8422 else if (rep!=Py_None) { 8423 Py_DECREF(rep); 8424 break; 8425 } 8426 Py_DECREF(rep); 8427 ++collendpos; 8428 } 8429 /* cache callback name lookup 8430 * (if not done yet, i.e. 
it's the first error) */ 8431 if (*error_handler == _Py_ERROR_UNKNOWN) 8432 *error_handler = get_error_handler(errors); 8433 8434 switch (*error_handler) { 8435 case _Py_ERROR_STRICT: 8436 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8437 return -1; 8438 8439 case _Py_ERROR_REPLACE: 8440 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 8441 x = charmapencode_output('?', mapping, res, respos); 8442 if (x==enc_EXCEPTION) { 8443 return -1; 8444 } 8445 else if (x==enc_FAILED) { 8446 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8447 return -1; 8448 } 8449 } 8450 /* fall through */ 8451 case _Py_ERROR_IGNORE: 8452 *inpos = collendpos; 8453 break; 8454 8455 case _Py_ERROR_XMLCHARREFREPLACE: 8456 /* generate replacement (temporarily (mis)uses p) */ 8457 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 8458 char buffer[2+29+1+1]; 8459 char *cp; 8460 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos)); 8461 for (cp = buffer; *cp; ++cp) { 8462 x = charmapencode_output(*cp, mapping, res, respos); 8463 if (x==enc_EXCEPTION) 8464 return -1; 8465 else if (x==enc_FAILED) { 8466 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8467 return -1; 8468 } 8469 } 8470 } 8471 *inpos = collendpos; 8472 break; 8473 8474 default: 8475 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj, 8476 encoding, reason, unicode, exceptionObject, 8477 collstartpos, collendpos, &newpos); 8478 if (repunicode == NULL) 8479 return -1; 8480 if (PyBytes_Check(repunicode)) { 8481 /* Directly copy bytes result to output. */ 8482 Py_ssize_t outsize = PyBytes_Size(*res); 8483 Py_ssize_t requiredsize; 8484 repsize = PyBytes_Size(repunicode); 8485 requiredsize = *respos + repsize; 8486 if (requiredsize > outsize) 8487 /* Make room for all additional bytes. 
*/ 8488 if (charmapencode_resize(res, respos, requiredsize)) { 8489 Py_DECREF(repunicode); 8490 return -1; 8491 } 8492 memcpy(PyBytes_AsString(*res) + *respos, 8493 PyBytes_AsString(repunicode), repsize); 8494 *respos += repsize; 8495 *inpos = newpos; 8496 Py_DECREF(repunicode); 8497 break; 8498 } 8499 /* generate replacement */ 8500 if (PyUnicode_READY(repunicode) == -1) { 8501 Py_DECREF(repunicode); 8502 return -1; 8503 } 8504 repsize = PyUnicode_GET_LENGTH(repunicode); 8505 data = PyUnicode_DATA(repunicode); 8506 kind = PyUnicode_KIND(repunicode); 8507 for (index = 0; index < repsize; index++) { 8508 Py_UCS4 repch = PyUnicode_READ(kind, data, index); 8509 x = charmapencode_output(repch, mapping, res, respos); 8510 if (x==enc_EXCEPTION) { 8511 Py_DECREF(repunicode); 8512 return -1; 8513 } 8514 else if (x==enc_FAILED) { 8515 Py_DECREF(repunicode); 8516 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8517 return -1; 8518 } 8519 } 8520 *inpos = newpos; 8521 Py_DECREF(repunicode); 8522 } 8523 return 0; 8524 } 8525 8526 PyObject * 8527 _PyUnicode_EncodeCharmap(PyObject *unicode, 8528 PyObject *mapping, 8529 const char *errors) 8530 { 8531 /* output object */ 8532 PyObject *res = NULL; 8533 /* current input position */ 8534 Py_ssize_t inpos = 0; 8535 Py_ssize_t size; 8536 /* current output position */ 8537 Py_ssize_t respos = 0; 8538 PyObject *error_handler_obj = NULL; 8539 PyObject *exc = NULL; 8540 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 8541 void *data; 8542 int kind; 8543 8544 if (PyUnicode_READY(unicode) == -1) 8545 return NULL; 8546 size = PyUnicode_GET_LENGTH(unicode); 8547 data = PyUnicode_DATA(unicode); 8548 kind = PyUnicode_KIND(unicode); 8549 8550 /* Default to Latin-1 */ 8551 if (mapping == NULL) 8552 return unicode_encode_ucs1(unicode, errors, 256); 8553 8554 /* allocate enough for a simple encoding without 8555 replacements, if we need more, we'll resize */ 8556 res = 
PyBytes_FromStringAndSize(NULL, size); 8557 if (res == NULL) 8558 goto onError; 8559 if (size == 0) 8560 return res; 8561 8562 while (inpos<size) { 8563 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos); 8564 /* try to encode it */ 8565 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 8566 if (x==enc_EXCEPTION) /* error */ 8567 goto onError; 8568 if (x==enc_FAILED) { /* unencodable character */ 8569 if (charmap_encoding_error(unicode, &inpos, mapping, 8570 &exc, 8571 &error_handler, &error_handler_obj, errors, 8572 &res, &respos)) { 8573 goto onError; 8574 } 8575 } 8576 else 8577 /* done with this character => adjust input position */ 8578 ++inpos; 8579 } 8580 8581 /* Resize if we allocated to much */ 8582 if (respos<PyBytes_GET_SIZE(res)) 8583 if (_PyBytes_Resize(&res, respos) < 0) 8584 goto onError; 8585 8586 Py_XDECREF(exc); 8587 Py_XDECREF(error_handler_obj); 8588 return res; 8589 8590 onError: 8591 Py_XDECREF(res); 8592 Py_XDECREF(exc); 8593 Py_XDECREF(error_handler_obj); 8594 return NULL; 8595 } 8596 8597 /* Deprecated */ 8598 PyObject * 8599 PyUnicode_EncodeCharmap(const Py_UNICODE *p, 8600 Py_ssize_t size, 8601 PyObject *mapping, 8602 const char *errors) 8603 { 8604 PyObject *result; 8605 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8606 if (unicode == NULL) 8607 return NULL; 8608 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); 8609 Py_DECREF(unicode); 8610 return result; 8611 } 8612 8613 PyObject * 8614 PyUnicode_AsCharmapString(PyObject *unicode, 8615 PyObject *mapping) 8616 { 8617 if (!PyUnicode_Check(unicode) || mapping == NULL) { 8618 PyErr_BadArgument(); 8619 return NULL; 8620 } 8621 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8622 } 8623 8624 /* create or adjust a UnicodeTranslateError */ 8625 static void 8626 make_translate_exception(PyObject **exceptionObject, 8627 PyObject *unicode, 8628 Py_ssize_t startpos, Py_ssize_t endpos, 8629 const char *reason) 8630 { 8631 if (*exceptionObject == NULL) 
{ 8632 *exceptionObject = _PyUnicodeTranslateError_Create( 8633 unicode, startpos, endpos, reason); 8634 } 8635 else { 8636 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8637 goto onError; 8638 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8639 goto onError; 8640 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8641 goto onError; 8642 return; 8643 onError: 8644 Py_CLEAR(*exceptionObject); 8645 } 8646 } 8647 8648 /* error handling callback helper: 8649 build arguments, call the callback and check the arguments, 8650 put the result into newpos and return the replacement string, which 8651 has to be freed by the caller */ 8652 static PyObject * 8653 unicode_translate_call_errorhandler(const char *errors, 8654 PyObject **errorHandler, 8655 const char *reason, 8656 PyObject *unicode, PyObject **exceptionObject, 8657 Py_ssize_t startpos, Py_ssize_t endpos, 8658 Py_ssize_t *newpos) 8659 { 8660 static const char *argparse = "O!n;translating error handler must return (str, int) tuple"; 8661 8662 Py_ssize_t i_newpos; 8663 PyObject *restuple; 8664 PyObject *resunicode; 8665 8666 if (*errorHandler == NULL) { 8667 *errorHandler = PyCodec_LookupError(errors); 8668 if (*errorHandler == NULL) 8669 return NULL; 8670 } 8671 8672 make_translate_exception(exceptionObject, 8673 unicode, startpos, endpos, reason); 8674 if (*exceptionObject == NULL) 8675 return NULL; 8676 8677 restuple = PyObject_CallFunctionObjArgs( 8678 *errorHandler, *exceptionObject, NULL); 8679 if (restuple == NULL) 8680 return NULL; 8681 if (!PyTuple_Check(restuple)) { 8682 PyErr_SetString(PyExc_TypeError, &argparse[4]); 8683 Py_DECREF(restuple); 8684 return NULL; 8685 } 8686 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 8687 &resunicode, &i_newpos)) { 8688 Py_DECREF(restuple); 8689 return NULL; 8690 } 8691 if (i_newpos<0) 8692 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8693 else 8694 *newpos = i_newpos; 8695 if (*newpos<0 || 
*newpos>PyUnicode_GET_LENGTH(unicode)) { 8696 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8697 Py_DECREF(restuple); 8698 return NULL; 8699 } 8700 Py_INCREF(resunicode); 8701 Py_DECREF(restuple); 8702 return resunicode; 8703 } 8704 8705 /* Lookup the character ch in the mapping and put the result in result, 8706 which must be decrefed by the caller. 8707 Return 0 on success, -1 on error */ 8708 static int 8709 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8710 { 8711 PyObject *w = PyLong_FromLong((long)c); 8712 PyObject *x; 8713 8714 if (w == NULL) 8715 return -1; 8716 x = PyObject_GetItem(mapping, w); 8717 Py_DECREF(w); 8718 if (x == NULL) { 8719 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8720 /* No mapping found means: use 1:1 mapping. */ 8721 PyErr_Clear(); 8722 *result = NULL; 8723 return 0; 8724 } else 8725 return -1; 8726 } 8727 else if (x == Py_None) { 8728 *result = x; 8729 return 0; 8730 } 8731 else if (PyLong_Check(x)) { 8732 long value = PyLong_AS_LONG(x); 8733 if (value < 0 || value > MAX_UNICODE) { 8734 PyErr_Format(PyExc_ValueError, 8735 "character mapping must be in range(0x%x)", 8736 MAX_UNICODE+1); 8737 Py_DECREF(x); 8738 return -1; 8739 } 8740 *result = x; 8741 return 0; 8742 } 8743 else if (PyUnicode_Check(x)) { 8744 *result = x; 8745 return 0; 8746 } 8747 else { 8748 /* wrong return value */ 8749 PyErr_SetString(PyExc_TypeError, 8750 "character mapping must return integer, None or str"); 8751 Py_DECREF(x); 8752 return -1; 8753 } 8754 } 8755 8756 /* lookup the character, write the result into the writer. 8757 Return 1 if the result was written into the writer, return 0 if the mapping 8758 was undefined, raise an exception return -1 on error. 
*/ 8759 static int 8760 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping, 8761 _PyUnicodeWriter *writer) 8762 { 8763 PyObject *item; 8764 8765 if (charmaptranslate_lookup(ch, mapping, &item)) 8766 return -1; 8767 8768 if (item == NULL) { 8769 /* not found => default to 1:1 mapping */ 8770 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 8771 return -1; 8772 } 8773 return 1; 8774 } 8775 8776 if (item == Py_None) { 8777 Py_DECREF(item); 8778 return 0; 8779 } 8780 8781 if (PyLong_Check(item)) { 8782 long ch = (Py_UCS4)PyLong_AS_LONG(item); 8783 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 8784 used it */ 8785 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 8786 Py_DECREF(item); 8787 return -1; 8788 } 8789 Py_DECREF(item); 8790 return 1; 8791 } 8792 8793 if (!PyUnicode_Check(item)) { 8794 Py_DECREF(item); 8795 return -1; 8796 } 8797 8798 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) { 8799 Py_DECREF(item); 8800 return -1; 8801 } 8802 8803 Py_DECREF(item); 8804 return 1; 8805 } 8806 8807 static int 8808 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch, 8809 Py_UCS1 *translate) 8810 { 8811 PyObject *item = NULL; 8812 int ret = 0; 8813 8814 if (charmaptranslate_lookup(ch, mapping, &item)) { 8815 return -1; 8816 } 8817 8818 if (item == Py_None) { 8819 /* deletion */ 8820 translate[ch] = 0xfe; 8821 } 8822 else if (item == NULL) { 8823 /* not found => default to 1:1 mapping */ 8824 translate[ch] = ch; 8825 return 1; 8826 } 8827 else if (PyLong_Check(item)) { 8828 long replace = PyLong_AS_LONG(item); 8829 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 8830 used it */ 8831 if (127 < replace) { 8832 /* invalid character or character outside ASCII: 8833 skip the fast translate */ 8834 goto exit; 8835 } 8836 translate[ch] = (Py_UCS1)replace; 8837 } 8838 else if (PyUnicode_Check(item)) { 8839 Py_UCS4 replace; 8840 8841 if (PyUnicode_READY(item) == -1) { 8842 Py_DECREF(item); 8843 return -1; 8844 } 8845 if 
(PyUnicode_GET_LENGTH(item) != 1) 8846 goto exit; 8847 8848 replace = PyUnicode_READ_CHAR(item, 0); 8849 if (replace > 127) 8850 goto exit; 8851 translate[ch] = (Py_UCS1)replace; 8852 } 8853 else { 8854 /* not None, NULL, long or unicode */ 8855 goto exit; 8856 } 8857 ret = 1; 8858 8859 exit: 8860 Py_DECREF(item); 8861 return ret; 8862 } 8863 8864 /* Fast path for ascii => ascii translation. Return 1 if the whole string 8865 was translated into writer, return 0 if the input string was partially 8866 translated into writer, raise an exception and return -1 on error. */ 8867 static int 8868 unicode_fast_translate(PyObject *input, PyObject *mapping, 8869 _PyUnicodeWriter *writer, int ignore, 8870 Py_ssize_t *input_pos) 8871 { 8872 Py_UCS1 ascii_table[128], ch, ch2; 8873 Py_ssize_t len; 8874 Py_UCS1 *in, *end, *out; 8875 int res = 0; 8876 8877 len = PyUnicode_GET_LENGTH(input); 8878 8879 memset(ascii_table, 0xff, 128); 8880 8881 in = PyUnicode_1BYTE_DATA(input); 8882 end = in + len; 8883 8884 assert(PyUnicode_IS_ASCII(writer->buffer)); 8885 assert(PyUnicode_GET_LENGTH(writer->buffer) == len); 8886 out = PyUnicode_1BYTE_DATA(writer->buffer); 8887 8888 for (; in < end; in++) { 8889 ch = *in; 8890 ch2 = ascii_table[ch]; 8891 if (ch2 == 0xff) { 8892 int translate = unicode_fast_translate_lookup(mapping, ch, 8893 ascii_table); 8894 if (translate < 0) 8895 return -1; 8896 if (translate == 0) 8897 goto exit; 8898 ch2 = ascii_table[ch]; 8899 } 8900 if (ch2 == 0xfe) { 8901 if (ignore) 8902 continue; 8903 goto exit; 8904 } 8905 assert(ch2 < 128); 8906 *out = ch2; 8907 out++; 8908 } 8909 res = 1; 8910 8911 exit: 8912 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer); 8913 *input_pos = in - PyUnicode_1BYTE_DATA(input); 8914 return res; 8915 } 8916 8917 static PyObject * 8918 _PyUnicode_TranslateCharmap(PyObject *input, 8919 PyObject *mapping, 8920 const char *errors) 8921 { 8922 /* input object */ 8923 char *data; 8924 Py_ssize_t size, i; 8925 int kind; 8926 /* output 
buffer */ 8927 _PyUnicodeWriter writer; 8928 /* error handler */ 8929 char *reason = "character maps to <undefined>"; 8930 PyObject *errorHandler = NULL; 8931 PyObject *exc = NULL; 8932 int ignore; 8933 int res; 8934 8935 if (mapping == NULL) { 8936 PyErr_BadArgument(); 8937 return NULL; 8938 } 8939 8940 if (PyUnicode_READY(input) == -1) 8941 return NULL; 8942 data = (char*)PyUnicode_DATA(input); 8943 kind = PyUnicode_KIND(input); 8944 size = PyUnicode_GET_LENGTH(input); 8945 8946 if (size == 0) 8947 return PyUnicode_FromObject(input); 8948 8949 /* allocate enough for a simple 1:1 translation without 8950 replacements, if we need more, we'll resize */ 8951 _PyUnicodeWriter_Init(&writer); 8952 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) 8953 goto onError; 8954 8955 ignore = (errors != NULL && strcmp(errors, "ignore") == 0); 8956 8957 if (PyUnicode_READY(input) == -1) 8958 return NULL; 8959 if (PyUnicode_IS_ASCII(input)) { 8960 res = unicode_fast_translate(input, mapping, &writer, ignore, &i); 8961 if (res < 0) { 8962 _PyUnicodeWriter_Dealloc(&writer); 8963 return NULL; 8964 } 8965 if (res == 1) 8966 return _PyUnicodeWriter_Finish(&writer); 8967 } 8968 else { 8969 i = 0; 8970 } 8971 8972 while (i<size) { 8973 /* try to encode it */ 8974 int translate; 8975 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8976 Py_ssize_t newpos; 8977 /* startpos for collecting untranslatable chars */ 8978 Py_ssize_t collstart; 8979 Py_ssize_t collend; 8980 Py_UCS4 ch; 8981 8982 ch = PyUnicode_READ(kind, data, i); 8983 translate = charmaptranslate_output(ch, mapping, &writer); 8984 if (translate < 0) 8985 goto onError; 8986 8987 if (translate != 0) { 8988 /* it worked => adjust input pointer */ 8989 ++i; 8990 continue; 8991 } 8992 8993 /* untranslatable character */ 8994 collstart = i; 8995 collend = i+1; 8996 8997 /* find all untranslatable characters */ 8998 while (collend < size) { 8999 PyObject *x; 9000 ch = PyUnicode_READ(kind, data, collend); 9001 
if (charmaptranslate_lookup(ch, mapping, &x)) 9002 goto onError; 9003 Py_XDECREF(x); 9004 if (x != Py_None) 9005 break; 9006 ++collend; 9007 } 9008 9009 if (ignore) { 9010 i = collend; 9011 } 9012 else { 9013 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 9014 reason, input, &exc, 9015 collstart, collend, &newpos); 9016 if (repunicode == NULL) 9017 goto onError; 9018 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) { 9019 Py_DECREF(repunicode); 9020 goto onError; 9021 } 9022 Py_DECREF(repunicode); 9023 i = newpos; 9024 } 9025 } 9026 Py_XDECREF(exc); 9027 Py_XDECREF(errorHandler); 9028 return _PyUnicodeWriter_Finish(&writer); 9029 9030 onError: 9031 _PyUnicodeWriter_Dealloc(&writer); 9032 Py_XDECREF(exc); 9033 Py_XDECREF(errorHandler); 9034 return NULL; 9035 } 9036 9037 /* Deprecated. Use PyUnicode_Translate instead. */ 9038 PyObject * 9039 PyUnicode_TranslateCharmap(const Py_UNICODE *p, 9040 Py_ssize_t size, 9041 PyObject *mapping, 9042 const char *errors) 9043 { 9044 PyObject *result; 9045 PyObject *unicode = PyUnicode_FromUnicode(p, size); 9046 if (!unicode) 9047 return NULL; 9048 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors); 9049 Py_DECREF(unicode); 9050 return result; 9051 } 9052 9053 PyObject * 9054 PyUnicode_Translate(PyObject *str, 9055 PyObject *mapping, 9056 const char *errors) 9057 { 9058 if (ensure_unicode(str) < 0) 9059 return NULL; 9060 return _PyUnicode_TranslateCharmap(str, mapping, errors); 9061 } 9062 9063 static Py_UCS4 9064 fix_decimal_and_space_to_ascii(PyObject *self) 9065 { 9066 /* No need to call PyUnicode_READY(self) because this function is only 9067 called as a callback from fixup() which does it already. 
*/ 9068 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9069 const int kind = PyUnicode_KIND(self); 9070 void *data = PyUnicode_DATA(self); 9071 Py_UCS4 maxchar = 127, ch, fixed; 9072 int modified = 0; 9073 Py_ssize_t i; 9074 9075 for (i = 0; i < len; ++i) { 9076 ch = PyUnicode_READ(kind, data, i); 9077 fixed = 0; 9078 if (ch > 127) { 9079 if (Py_UNICODE_ISSPACE(ch)) 9080 fixed = ' '; 9081 else { 9082 const int decimal = Py_UNICODE_TODECIMAL(ch); 9083 if (decimal >= 0) 9084 fixed = '0' + decimal; 9085 } 9086 if (fixed != 0) { 9087 modified = 1; 9088 maxchar = Py_MAX(maxchar, fixed); 9089 PyUnicode_WRITE(kind, data, i, fixed); 9090 } 9091 else 9092 maxchar = Py_MAX(maxchar, ch); 9093 } 9094 } 9095 9096 return (modified) ? maxchar : 0; 9097 } 9098 9099 PyObject * 9100 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 9101 { 9102 if (!PyUnicode_Check(unicode)) { 9103 PyErr_BadInternalCall(); 9104 return NULL; 9105 } 9106 if (PyUnicode_READY(unicode) == -1) 9107 return NULL; 9108 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 9109 /* If the string is already ASCII, just return the same string */ 9110 Py_INCREF(unicode); 9111 return unicode; 9112 } 9113 return fixup(unicode, fix_decimal_and_space_to_ascii); 9114 } 9115 9116 PyObject * 9117 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 9118 Py_ssize_t length) 9119 { 9120 PyObject *decimal; 9121 Py_ssize_t i; 9122 Py_UCS4 maxchar; 9123 enum PyUnicode_Kind kind; 9124 void *data; 9125 9126 maxchar = 127; 9127 for (i = 0; i < length; i++) { 9128 Py_UCS4 ch = s[i]; 9129 if (ch > 127) { 9130 int decimal = Py_UNICODE_TODECIMAL(ch); 9131 if (decimal >= 0) 9132 ch = '0' + decimal; 9133 maxchar = Py_MAX(maxchar, ch); 9134 } 9135 } 9136 9137 /* Copy to a new string */ 9138 decimal = PyUnicode_New(length, maxchar); 9139 if (decimal == NULL) 9140 return decimal; 9141 kind = PyUnicode_KIND(decimal); 9142 data = PyUnicode_DATA(decimal); 9143 /* Iterate over code points */ 9144 for (i = 0; i < length; i++) { 9145 
Py_UCS4 ch = s[i]; 9146 if (ch > 127) { 9147 int decimal = Py_UNICODE_TODECIMAL(ch); 9148 if (decimal >= 0) 9149 ch = '0' + decimal; 9150 } 9151 PyUnicode_WRITE(kind, data, i, ch); 9152 } 9153 return unicode_result(decimal); 9154 } 9155 /* --- Decimal Encoder ---------------------------------------------------- */ 9156 9157 int 9158 PyUnicode_EncodeDecimal(Py_UNICODE *s, 9159 Py_ssize_t length, 9160 char *output, 9161 const char *errors) 9162 { 9163 PyObject *unicode; 9164 Py_ssize_t i; 9165 enum PyUnicode_Kind kind; 9166 void *data; 9167 9168 if (output == NULL) { 9169 PyErr_BadArgument(); 9170 return -1; 9171 } 9172 9173 unicode = PyUnicode_FromUnicode(s, length); 9174 if (unicode == NULL) 9175 return -1; 9176 9177 if (PyUnicode_READY(unicode) == -1) { 9178 Py_DECREF(unicode); 9179 return -1; 9180 } 9181 kind = PyUnicode_KIND(unicode); 9182 data = PyUnicode_DATA(unicode); 9183 9184 for (i=0; i < length; ) { 9185 PyObject *exc; 9186 Py_UCS4 ch; 9187 int decimal; 9188 Py_ssize_t startpos; 9189 9190 ch = PyUnicode_READ(kind, data, i); 9191 9192 if (Py_UNICODE_ISSPACE(ch)) { 9193 *output++ = ' '; 9194 i++; 9195 continue; 9196 } 9197 decimal = Py_UNICODE_TODECIMAL(ch); 9198 if (decimal >= 0) { 9199 *output++ = '0' + decimal; 9200 i++; 9201 continue; 9202 } 9203 if (0 < ch && ch < 256) { 9204 *output++ = (char)ch; 9205 i++; 9206 continue; 9207 } 9208 9209 startpos = i; 9210 exc = NULL; 9211 raise_encode_exception(&exc, "decimal", unicode, 9212 startpos, startpos+1, 9213 "invalid decimal Unicode string"); 9214 Py_XDECREF(exc); 9215 Py_DECREF(unicode); 9216 return -1; 9217 } 9218 /* 0-terminate the output string */ 9219 *output++ = '\0'; 9220 Py_DECREF(unicode); 9221 return 0; 9222 } 9223 9224 /* --- Helpers ------------------------------------------------------------ */ 9225 9226 /* helper macro to fixup start/end slice values */ 9227 #define ADJUST_INDICES(start, end, len) \ 9228 if (end > len) \ 9229 end = len; \ 9230 else if (end < 0) { \ 9231 end += len; \ 9232 if 
(end < 0) \ 9233 end = 0; \ 9234 } \ 9235 if (start < 0) { \ 9236 start += len; \ 9237 if (start < 0) \ 9238 start = 0; \ 9239 } 9240 9241 static Py_ssize_t 9242 any_find_slice(PyObject* s1, PyObject* s2, 9243 Py_ssize_t start, 9244 Py_ssize_t end, 9245 int direction) 9246 { 9247 int kind1, kind2; 9248 void *buf1, *buf2; 9249 Py_ssize_t len1, len2, result; 9250 9251 kind1 = PyUnicode_KIND(s1); 9252 kind2 = PyUnicode_KIND(s2); 9253 if (kind1 < kind2) 9254 return -1; 9255 9256 len1 = PyUnicode_GET_LENGTH(s1); 9257 len2 = PyUnicode_GET_LENGTH(s2); 9258 ADJUST_INDICES(start, end, len1); 9259 if (end - start < len2) 9260 return -1; 9261 9262 buf1 = PyUnicode_DATA(s1); 9263 buf2 = PyUnicode_DATA(s2); 9264 if (len2 == 1) { 9265 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0); 9266 result = findchar((const char *)buf1 + kind1*start, 9267 kind1, end - start, ch, direction); 9268 if (result == -1) 9269 return -1; 9270 else 9271 return start + result; 9272 } 9273 9274 if (kind2 != kind1) { 9275 buf2 = _PyUnicode_AsKind(s2, kind1); 9276 if (!buf2) 9277 return -2; 9278 } 9279 9280 if (direction > 0) { 9281 switch (kind1) { 9282 case PyUnicode_1BYTE_KIND: 9283 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 9284 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); 9285 else 9286 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); 9287 break; 9288 case PyUnicode_2BYTE_KIND: 9289 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); 9290 break; 9291 case PyUnicode_4BYTE_KIND: 9292 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); 9293 break; 9294 default: 9295 assert(0); result = -2; 9296 } 9297 } 9298 else { 9299 switch (kind1) { 9300 case PyUnicode_1BYTE_KIND: 9301 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 9302 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); 9303 else 9304 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9305 break; 9306 case PyUnicode_2BYTE_KIND: 9307 result = 
ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9308 break; 9309 case PyUnicode_4BYTE_KIND: 9310 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9311 break; 9312 default: 9313 assert(0); result = -2; 9314 } 9315 } 9316 9317 if (kind2 != kind1) 9318 PyMem_Free(buf2); 9319 9320 return result; 9321 } 9322 9323 Py_ssize_t 9324 _PyUnicode_InsertThousandsGrouping( 9325 PyObject *unicode, Py_ssize_t index, 9326 Py_ssize_t n_buffer, 9327 void *digits, Py_ssize_t n_digits, 9328 Py_ssize_t min_width, 9329 const char *grouping, PyObject *thousands_sep, 9330 Py_UCS4 *maxchar) 9331 { 9332 unsigned int kind, thousands_sep_kind; 9333 char *data, *thousands_sep_data; 9334 Py_ssize_t thousands_sep_len; 9335 Py_ssize_t len; 9336 9337 if (unicode != NULL) { 9338 kind = PyUnicode_KIND(unicode); 9339 data = (char *) PyUnicode_DATA(unicode) + index * kind; 9340 } 9341 else { 9342 kind = PyUnicode_1BYTE_KIND; 9343 data = NULL; 9344 } 9345 thousands_sep_kind = PyUnicode_KIND(thousands_sep); 9346 thousands_sep_data = PyUnicode_DATA(thousands_sep); 9347 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep); 9348 if (unicode != NULL && thousands_sep_kind != kind) { 9349 if (thousands_sep_kind < kind) { 9350 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind); 9351 if (!thousands_sep_data) 9352 return -1; 9353 } 9354 else { 9355 data = _PyUnicode_AsKind(unicode, thousands_sep_kind); 9356 if (!data) 9357 return -1; 9358 } 9359 } 9360 9361 switch (kind) { 9362 case PyUnicode_1BYTE_KIND: 9363 if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) 9364 len = asciilib_InsertThousandsGrouping( 9365 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits, 9366 min_width, grouping, 9367 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 9368 else 9369 len = ucs1lib_InsertThousandsGrouping( 9370 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 9371 min_width, grouping, 9372 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 9373 break; 9374 case 
PyUnicode_2BYTE_KIND: 9375 len = ucs2lib_InsertThousandsGrouping( 9376 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits, 9377 min_width, grouping, 9378 (Py_UCS2 *) thousands_sep_data, thousands_sep_len); 9379 break; 9380 case PyUnicode_4BYTE_KIND: 9381 len = ucs4lib_InsertThousandsGrouping( 9382 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits, 9383 min_width, grouping, 9384 (Py_UCS4 *) thousands_sep_data, thousands_sep_len); 9385 break; 9386 default: 9387 assert(0); 9388 return -1; 9389 } 9390 if (unicode != NULL && thousands_sep_kind != kind) { 9391 if (thousands_sep_kind < kind) 9392 PyMem_Free(thousands_sep_data); 9393 else 9394 PyMem_Free(data); 9395 } 9396 if (unicode == NULL) { 9397 *maxchar = 127; 9398 if (len != n_digits) { 9399 *maxchar = Py_MAX(*maxchar, 9400 PyUnicode_MAX_CHAR_VALUE(thousands_sep)); 9401 } 9402 } 9403 return len; 9404 } 9405 9406 9407 Py_ssize_t 9408 PyUnicode_Count(PyObject *str, 9409 PyObject *substr, 9410 Py_ssize_t start, 9411 Py_ssize_t end) 9412 { 9413 Py_ssize_t result; 9414 int kind1, kind2; 9415 void *buf1 = NULL, *buf2 = NULL; 9416 Py_ssize_t len1, len2; 9417 9418 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) 9419 return -1; 9420 9421 kind1 = PyUnicode_KIND(str); 9422 kind2 = PyUnicode_KIND(substr); 9423 if (kind1 < kind2) 9424 return 0; 9425 9426 len1 = PyUnicode_GET_LENGTH(str); 9427 len2 = PyUnicode_GET_LENGTH(substr); 9428 ADJUST_INDICES(start, end, len1); 9429 if (end - start < len2) 9430 return 0; 9431 9432 buf1 = PyUnicode_DATA(str); 9433 buf2 = PyUnicode_DATA(substr); 9434 if (kind2 != kind1) { 9435 buf2 = _PyUnicode_AsKind(substr, kind1); 9436 if (!buf2) 9437 goto onError; 9438 } 9439 9440 switch (kind1) { 9441 case PyUnicode_1BYTE_KIND: 9442 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr)) 9443 result = asciilib_count( 9444 ((Py_UCS1*)buf1) + start, end - start, 9445 buf2, len2, PY_SSIZE_T_MAX 9446 ); 9447 else 9448 result = ucs1lib_count( 9449 ((Py_UCS1*)buf1) + start, end - 
start, 9450 buf2, len2, PY_SSIZE_T_MAX 9451 ); 9452 break; 9453 case PyUnicode_2BYTE_KIND: 9454 result = ucs2lib_count( 9455 ((Py_UCS2*)buf1) + start, end - start, 9456 buf2, len2, PY_SSIZE_T_MAX 9457 ); 9458 break; 9459 case PyUnicode_4BYTE_KIND: 9460 result = ucs4lib_count( 9461 ((Py_UCS4*)buf1) + start, end - start, 9462 buf2, len2, PY_SSIZE_T_MAX 9463 ); 9464 break; 9465 default: 9466 assert(0); result = 0; 9467 } 9468 9469 if (kind2 != kind1) 9470 PyMem_Free(buf2); 9471 9472 return result; 9473 onError: 9474 if (kind2 != kind1 && buf2) 9475 PyMem_Free(buf2); 9476 return -1; 9477 } 9478 9479 Py_ssize_t 9480 PyUnicode_Find(PyObject *str, 9481 PyObject *substr, 9482 Py_ssize_t start, 9483 Py_ssize_t end, 9484 int direction) 9485 { 9486 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) 9487 return -2; 9488 9489 return any_find_slice(str, substr, start, end, direction); 9490 } 9491 9492 Py_ssize_t 9493 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 9494 Py_ssize_t start, Py_ssize_t end, 9495 int direction) 9496 { 9497 int kind; 9498 Py_ssize_t result; 9499 if (PyUnicode_READY(str) == -1) 9500 return -2; 9501 if (start < 0 || end < 0) { 9502 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9503 return -2; 9504 } 9505 if (end > PyUnicode_GET_LENGTH(str)) 9506 end = PyUnicode_GET_LENGTH(str); 9507 if (start >= end) 9508 return -1; 9509 kind = PyUnicode_KIND(str); 9510 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start, 9511 kind, end-start, ch, direction); 9512 if (result == -1) 9513 return -1; 9514 else 9515 return start + result; 9516 } 9517 9518 static int 9519 tailmatch(PyObject *self, 9520 PyObject *substring, 9521 Py_ssize_t start, 9522 Py_ssize_t end, 9523 int direction) 9524 { 9525 int kind_self; 9526 int kind_sub; 9527 void *data_self; 9528 void *data_sub; 9529 Py_ssize_t offset; 9530 Py_ssize_t i; 9531 Py_ssize_t end_sub; 9532 9533 if (PyUnicode_READY(self) == -1 || 9534 PyUnicode_READY(substring) == -1) 9535 return -1; 9536 9537 
ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 9538 end -= PyUnicode_GET_LENGTH(substring); 9539 if (end < start) 9540 return 0; 9541 9542 if (PyUnicode_GET_LENGTH(substring) == 0) 9543 return 1; 9544 9545 kind_self = PyUnicode_KIND(self); 9546 data_self = PyUnicode_DATA(self); 9547 kind_sub = PyUnicode_KIND(substring); 9548 data_sub = PyUnicode_DATA(substring); 9549 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 9550 9551 if (direction > 0) 9552 offset = end; 9553 else 9554 offset = start; 9555 9556 if (PyUnicode_READ(kind_self, data_self, offset) == 9557 PyUnicode_READ(kind_sub, data_sub, 0) && 9558 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 9559 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 9560 /* If both are of the same kind, memcmp is sufficient */ 9561 if (kind_self == kind_sub) { 9562 return ! memcmp((char *)data_self + 9563 (offset * PyUnicode_KIND(substring)), 9564 data_sub, 9565 PyUnicode_GET_LENGTH(substring) * 9566 PyUnicode_KIND(substring)); 9567 } 9568 /* otherwise we have to compare each character by first accessing it */ 9569 else { 9570 /* We do not need to compare 0 and len(substring)-1 because 9571 the if statement above ensured already that they are equal 9572 when we end up here. 
*/ 9573 for (i = 1; i < end_sub; ++i) { 9574 if (PyUnicode_READ(kind_self, data_self, offset + i) != 9575 PyUnicode_READ(kind_sub, data_sub, i)) 9576 return 0; 9577 } 9578 return 1; 9579 } 9580 } 9581 9582 return 0; 9583 } 9584 9585 Py_ssize_t 9586 PyUnicode_Tailmatch(PyObject *str, 9587 PyObject *substr, 9588 Py_ssize_t start, 9589 Py_ssize_t end, 9590 int direction) 9591 { 9592 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) 9593 return -1; 9594 9595 return tailmatch(str, substr, start, end, direction); 9596 } 9597 9598 /* Apply fixfct filter to the Unicode object self and return a 9599 reference to the modified object */ 9600 9601 static PyObject * 9602 fixup(PyObject *self, 9603 Py_UCS4 (*fixfct)(PyObject *s)) 9604 { 9605 PyObject *u; 9606 Py_UCS4 maxchar_old, maxchar_new = 0; 9607 PyObject *v; 9608 9609 u = _PyUnicode_Copy(self); 9610 if (u == NULL) 9611 return NULL; 9612 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u); 9613 9614 /* fix functions return the new maximum character in a string, 9615 if the kind of the resulting unicode object does not change, 9616 everything is fine. Otherwise we need to change the string kind 9617 and re-run the fix function. */ 9618 maxchar_new = fixfct(u); 9619 9620 if (maxchar_new == 0) { 9621 /* no changes */; 9622 if (PyUnicode_CheckExact(self)) { 9623 Py_DECREF(u); 9624 Py_INCREF(self); 9625 return self; 9626 } 9627 else 9628 return u; 9629 } 9630 9631 maxchar_new = align_maxchar(maxchar_new); 9632 9633 if (maxchar_new == maxchar_old) 9634 return u; 9635 9636 /* In case the maximum character changed, we need to 9637 convert the string to the new category. */ 9638 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 9639 if (v == NULL) { 9640 Py_DECREF(u); 9641 return NULL; 9642 } 9643 if (maxchar_new > maxchar_old) { 9644 /* If the maxchar increased so that the kind changed, not all 9645 characters are representable anymore and we need to fix the 9646 string again. This only happens in very few cases. 
*/ 9647 _PyUnicode_FastCopyCharacters(v, 0, 9648 self, 0, PyUnicode_GET_LENGTH(self)); 9649 maxchar_old = fixfct(v); 9650 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 9651 } 9652 else { 9653 _PyUnicode_FastCopyCharacters(v, 0, 9654 u, 0, PyUnicode_GET_LENGTH(self)); 9655 } 9656 Py_DECREF(u); 9657 assert(_PyUnicode_CheckConsistency(v, 1)); 9658 return v; 9659 } 9660 9661 static PyObject * 9662 ascii_upper_or_lower(PyObject *self, int lower) 9663 { 9664 Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9665 char *resdata, *data = PyUnicode_DATA(self); 9666 PyObject *res; 9667 9668 res = PyUnicode_New(len, 127); 9669 if (res == NULL) 9670 return NULL; 9671 resdata = PyUnicode_DATA(res); 9672 if (lower) 9673 _Py_bytes_lower(resdata, data, len); 9674 else 9675 _Py_bytes_upper(resdata, data, len); 9676 return res; 9677 } 9678 9679 static Py_UCS4 9680 handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i) 9681 { 9682 Py_ssize_t j; 9683 int final_sigma; 9684 Py_UCS4 c = 0; /* initialize to prevent gcc warning */ 9685 /* U+03A3 is in the Final_Sigma context when, it is found like this: 9686 9687 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased}) 9688 9689 where ! is a negation and \p{xxx} is a character with property xxx. 9690 */ 9691 for (j = i - 1; j >= 0; j--) { 9692 c = PyUnicode_READ(kind, data, j); 9693 if (!_PyUnicode_IsCaseIgnorable(c)) 9694 break; 9695 } 9696 final_sigma = j >= 0 && _PyUnicode_IsCased(c); 9697 if (final_sigma) { 9698 for (j = i + 1; j < length; j++) { 9699 c = PyUnicode_READ(kind, data, j); 9700 if (!_PyUnicode_IsCaseIgnorable(c)) 9701 break; 9702 } 9703 final_sigma = j == length || !_PyUnicode_IsCased(c); 9704 } 9705 return (final_sigma) ? 0x3C2 : 0x3C3; 9706 } 9707 9708 static int 9709 lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i, 9710 Py_UCS4 c, Py_UCS4 *mapped) 9711 { 9712 /* Obscure special case. 
*/ 9713 if (c == 0x3A3) { 9714 mapped[0] = handle_capital_sigma(kind, data, length, i); 9715 return 1; 9716 } 9717 return _PyUnicode_ToLowerFull(c, mapped); 9718 } 9719 9720 static Py_ssize_t 9721 do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9722 { 9723 Py_ssize_t i, k = 0; 9724 int n_res, j; 9725 Py_UCS4 c, mapped[3]; 9726 9727 c = PyUnicode_READ(kind, data, 0); 9728 n_res = _PyUnicode_ToUpperFull(c, mapped); 9729 for (j = 0; j < n_res; j++) { 9730 *maxchar = Py_MAX(*maxchar, mapped[j]); 9731 res[k++] = mapped[j]; 9732 } 9733 for (i = 1; i < length; i++) { 9734 c = PyUnicode_READ(kind, data, i); 9735 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9736 for (j = 0; j < n_res; j++) { 9737 *maxchar = Py_MAX(*maxchar, mapped[j]); 9738 res[k++] = mapped[j]; 9739 } 9740 } 9741 return k; 9742 } 9743 9744 static Py_ssize_t 9745 do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { 9746 Py_ssize_t i, k = 0; 9747 9748 for (i = 0; i < length; i++) { 9749 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9750 int n_res, j; 9751 if (Py_UNICODE_ISUPPER(c)) { 9752 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9753 } 9754 else if (Py_UNICODE_ISLOWER(c)) { 9755 n_res = _PyUnicode_ToUpperFull(c, mapped); 9756 } 9757 else { 9758 n_res = 1; 9759 mapped[0] = c; 9760 } 9761 for (j = 0; j < n_res; j++) { 9762 *maxchar = Py_MAX(*maxchar, mapped[j]); 9763 res[k++] = mapped[j]; 9764 } 9765 } 9766 return k; 9767 } 9768 9769 static Py_ssize_t 9770 do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, 9771 Py_UCS4 *maxchar, int lower) 9772 { 9773 Py_ssize_t i, k = 0; 9774 9775 for (i = 0; i < length; i++) { 9776 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9777 int n_res, j; 9778 if (lower) 9779 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9780 else 9781 n_res = _PyUnicode_ToUpperFull(c, mapped); 9782 for (j = 0; j < n_res; j++) { 9783 *maxchar = Py_MAX(*maxchar, 
mapped[j]); 9784 res[k++] = mapped[j]; 9785 } 9786 } 9787 return k; 9788 } 9789 9790 static Py_ssize_t 9791 do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9792 { 9793 return do_upper_or_lower(kind, data, length, res, maxchar, 0); 9794 } 9795 9796 static Py_ssize_t 9797 do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9798 { 9799 return do_upper_or_lower(kind, data, length, res, maxchar, 1); 9800 } 9801 9802 static Py_ssize_t 9803 do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9804 { 9805 Py_ssize_t i, k = 0; 9806 9807 for (i = 0; i < length; i++) { 9808 Py_UCS4 c = PyUnicode_READ(kind, data, i); 9809 Py_UCS4 mapped[3]; 9810 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); 9811 for (j = 0; j < n_res; j++) { 9812 *maxchar = Py_MAX(*maxchar, mapped[j]); 9813 res[k++] = mapped[j]; 9814 } 9815 } 9816 return k; 9817 } 9818 9819 static Py_ssize_t 9820 do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9821 { 9822 Py_ssize_t i, k = 0; 9823 int previous_is_cased; 9824 9825 previous_is_cased = 0; 9826 for (i = 0; i < length; i++) { 9827 const Py_UCS4 c = PyUnicode_READ(kind, data, i); 9828 Py_UCS4 mapped[3]; 9829 int n_res, j; 9830 9831 if (previous_is_cased) 9832 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9833 else 9834 n_res = _PyUnicode_ToTitleFull(c, mapped); 9835 9836 for (j = 0; j < n_res; j++) { 9837 *maxchar = Py_MAX(*maxchar, mapped[j]); 9838 res[k++] = mapped[j]; 9839 } 9840 9841 previous_is_cased = _PyUnicode_IsCased(c); 9842 } 9843 return k; 9844 } 9845 9846 static PyObject * 9847 case_operation(PyObject *self, 9848 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *)) 9849 { 9850 PyObject *res = NULL; 9851 Py_ssize_t length, newlength = 0; 9852 int kind, outkind; 9853 void *data, *outdata; 9854 Py_UCS4 maxchar = 0, *tmp, *tmpend; 9855 9856 assert(PyUnicode_IS_READY(self)); 9857 9858 kind = 
PyUnicode_KIND(self); 9859 data = PyUnicode_DATA(self); 9860 length = PyUnicode_GET_LENGTH(self); 9861 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) { 9862 PyErr_SetString(PyExc_OverflowError, "string is too long"); 9863 return NULL; 9864 } 9865 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length); 9866 if (tmp == NULL) 9867 return PyErr_NoMemory(); 9868 newlength = perform(kind, data, length, tmp, &maxchar); 9869 res = PyUnicode_New(newlength, maxchar); 9870 if (res == NULL) 9871 goto leave; 9872 tmpend = tmp + newlength; 9873 outdata = PyUnicode_DATA(res); 9874 outkind = PyUnicode_KIND(res); 9875 switch (outkind) { 9876 case PyUnicode_1BYTE_KIND: 9877 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata); 9878 break; 9879 case PyUnicode_2BYTE_KIND: 9880 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata); 9881 break; 9882 case PyUnicode_4BYTE_KIND: 9883 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength); 9884 break; 9885 default: 9886 assert(0); 9887 break; 9888 } 9889 leave: 9890 PyMem_FREE(tmp); 9891 return res; 9892 } 9893 9894 PyObject * 9895 PyUnicode_Join(PyObject *separator, PyObject *seq) 9896 { 9897 PyObject *res; 9898 PyObject *fseq; 9899 Py_ssize_t seqlen; 9900 PyObject **items; 9901 9902 fseq = PySequence_Fast(seq, "can only join an iterable"); 9903 if (fseq == NULL) { 9904 return NULL; 9905 } 9906 9907 /* NOTE: the following code can't call back into Python code, 9908 * so we are sure that fseq won't be mutated. 
9909 */ 9910 9911 items = PySequence_Fast_ITEMS(fseq); 9912 seqlen = PySequence_Fast_GET_SIZE(fseq); 9913 res = _PyUnicode_JoinArray(separator, items, seqlen); 9914 Py_DECREF(fseq); 9915 return res; 9916 } 9917 9918 PyObject * 9919 _PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen) 9920 { 9921 PyObject *res = NULL; /* the result */ 9922 PyObject *sep = NULL; 9923 Py_ssize_t seplen; 9924 PyObject *item; 9925 Py_ssize_t sz, i, res_offset; 9926 Py_UCS4 maxchar; 9927 Py_UCS4 item_maxchar; 9928 int use_memcpy; 9929 unsigned char *res_data = NULL, *sep_data = NULL; 9930 PyObject *last_obj; 9931 unsigned int kind = 0; 9932 9933 /* If empty sequence, return u"". */ 9934 if (seqlen == 0) { 9935 _Py_RETURN_UNICODE_EMPTY(); 9936 } 9937 9938 /* If singleton sequence with an exact Unicode, return that. */ 9939 last_obj = NULL; 9940 if (seqlen == 1) { 9941 if (PyUnicode_CheckExact(items[0])) { 9942 res = items[0]; 9943 Py_INCREF(res); 9944 return res; 9945 } 9946 seplen = 0; 9947 maxchar = 0; 9948 } 9949 else { 9950 /* Set up sep and seplen */ 9951 if (separator == NULL) { 9952 /* fall back to a blank space separator */ 9953 sep = PyUnicode_FromOrdinal(' '); 9954 if (!sep) 9955 goto onError; 9956 seplen = 1; 9957 maxchar = 32; 9958 } 9959 else { 9960 if (!PyUnicode_Check(separator)) { 9961 PyErr_Format(PyExc_TypeError, 9962 "separator: expected str instance," 9963 " %.80s found", 9964 Py_TYPE(separator)->tp_name); 9965 goto onError; 9966 } 9967 if (PyUnicode_READY(separator)) 9968 goto onError; 9969 sep = separator; 9970 seplen = PyUnicode_GET_LENGTH(separator); 9971 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9972 /* inc refcount to keep this code path symmetric with the 9973 above case of a blank separator */ 9974 Py_INCREF(sep); 9975 } 9976 last_obj = sep; 9977 } 9978 9979 /* There are at least two things to join, or else we have a subclass 9980 * of str in the sequence. 
9981 * Do a pre-pass to figure out the total amount of space we'll 9982 * need (sz), and see whether all argument are strings. 9983 */ 9984 sz = 0; 9985 #ifdef Py_DEBUG 9986 use_memcpy = 0; 9987 #else 9988 use_memcpy = 1; 9989 #endif 9990 for (i = 0; i < seqlen; i++) { 9991 size_t add_sz; 9992 item = items[i]; 9993 if (!PyUnicode_Check(item)) { 9994 PyErr_Format(PyExc_TypeError, 9995 "sequence item %zd: expected str instance," 9996 " %.80s found", 9997 i, Py_TYPE(item)->tp_name); 9998 goto onError; 9999 } 10000 if (PyUnicode_READY(item) == -1) 10001 goto onError; 10002 add_sz = PyUnicode_GET_LENGTH(item); 10003 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 10004 maxchar = Py_MAX(maxchar, item_maxchar); 10005 if (i != 0) { 10006 add_sz += seplen; 10007 } 10008 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) { 10009 PyErr_SetString(PyExc_OverflowError, 10010 "join() result is too long for a Python string"); 10011 goto onError; 10012 } 10013 sz += add_sz; 10014 if (use_memcpy && last_obj != NULL) { 10015 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 10016 use_memcpy = 0; 10017 } 10018 last_obj = item; 10019 } 10020 10021 res = PyUnicode_New(sz, maxchar); 10022 if (res == NULL) 10023 goto onError; 10024 10025 /* Catenate everything. */ 10026 #ifdef Py_DEBUG 10027 use_memcpy = 0; 10028 #else 10029 if (use_memcpy) { 10030 res_data = PyUnicode_1BYTE_DATA(res); 10031 kind = PyUnicode_KIND(res); 10032 if (seplen != 0) 10033 sep_data = PyUnicode_1BYTE_DATA(sep); 10034 } 10035 #endif 10036 if (use_memcpy) { 10037 for (i = 0; i < seqlen; ++i) { 10038 Py_ssize_t itemlen; 10039 item = items[i]; 10040 10041 /* Copy item, and maybe the separator. 
*/ 10042 if (i && seplen != 0) { 10043 memcpy(res_data, 10044 sep_data, 10045 kind * seplen); 10046 res_data += kind * seplen; 10047 } 10048 10049 itemlen = PyUnicode_GET_LENGTH(item); 10050 if (itemlen != 0) { 10051 memcpy(res_data, 10052 PyUnicode_DATA(item), 10053 kind * itemlen); 10054 res_data += kind * itemlen; 10055 } 10056 } 10057 assert(res_data == PyUnicode_1BYTE_DATA(res) 10058 + kind * PyUnicode_GET_LENGTH(res)); 10059 } 10060 else { 10061 for (i = 0, res_offset = 0; i < seqlen; ++i) { 10062 Py_ssize_t itemlen; 10063 item = items[i]; 10064 10065 /* Copy item, and maybe the separator. */ 10066 if (i && seplen != 0) { 10067 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen); 10068 res_offset += seplen; 10069 } 10070 10071 itemlen = PyUnicode_GET_LENGTH(item); 10072 if (itemlen != 0) { 10073 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen); 10074 res_offset += itemlen; 10075 } 10076 } 10077 assert(res_offset == PyUnicode_GET_LENGTH(res)); 10078 } 10079 10080 Py_XDECREF(sep); 10081 assert(_PyUnicode_CheckConsistency(res, 1)); 10082 return res; 10083 10084 onError: 10085 Py_XDECREF(sep); 10086 Py_XDECREF(res); 10087 return NULL; 10088 } 10089 10090 #define FILL(kind, data, value, start, length) \ 10091 do { \ 10092 Py_ssize_t i_ = 0; \ 10093 assert(kind != PyUnicode_WCHAR_KIND); \ 10094 switch ((kind)) { \ 10095 case PyUnicode_1BYTE_KIND: { \ 10096 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 10097 memset(to_, (unsigned char)value, (length)); \ 10098 break; \ 10099 } \ 10100 case PyUnicode_2BYTE_KIND: { \ 10101 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 10102 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 10103 break; \ 10104 } \ 10105 case PyUnicode_4BYTE_KIND: { \ 10106 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 10107 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 10108 break; \ 10109 } \ 10110 default: assert(0); \ 10111 } \ 10112 } while (0) 10113 10114 void 10115 
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 10116 Py_UCS4 fill_char) 10117 { 10118 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 10119 const void *data = PyUnicode_DATA(unicode); 10120 assert(PyUnicode_IS_READY(unicode)); 10121 assert(unicode_modifiable(unicode)); 10122 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); 10123 assert(start >= 0); 10124 assert(start + length <= PyUnicode_GET_LENGTH(unicode)); 10125 FILL(kind, data, fill_char, start, length); 10126 } 10127 10128 Py_ssize_t 10129 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 10130 Py_UCS4 fill_char) 10131 { 10132 Py_ssize_t maxlen; 10133 10134 if (!PyUnicode_Check(unicode)) { 10135 PyErr_BadInternalCall(); 10136 return -1; 10137 } 10138 if (PyUnicode_READY(unicode) == -1) 10139 return -1; 10140 if (unicode_check_modifiable(unicode)) 10141 return -1; 10142 10143 if (start < 0) { 10144 PyErr_SetString(PyExc_IndexError, "string index out of range"); 10145 return -1; 10146 } 10147 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) { 10148 PyErr_SetString(PyExc_ValueError, 10149 "fill character is bigger than " 10150 "the string maximum character"); 10151 return -1; 10152 } 10153 10154 maxlen = PyUnicode_GET_LENGTH(unicode) - start; 10155 length = Py_MIN(maxlen, length); 10156 if (length <= 0) 10157 return 0; 10158 10159 _PyUnicode_FastFill(unicode, start, length, fill_char); 10160 return length; 10161 } 10162 10163 static PyObject * 10164 pad(PyObject *self, 10165 Py_ssize_t left, 10166 Py_ssize_t right, 10167 Py_UCS4 fill) 10168 { 10169 PyObject *u; 10170 Py_UCS4 maxchar; 10171 int kind; 10172 void *data; 10173 10174 if (left < 0) 10175 left = 0; 10176 if (right < 0) 10177 right = 0; 10178 10179 if (left == 0 && right == 0) 10180 return unicode_result_unchanged(self); 10181 10182 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 10183 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 10184 
PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 10185 return NULL; 10186 } 10187 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10188 maxchar = Py_MAX(maxchar, fill); 10189 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 10190 if (!u) 10191 return NULL; 10192 10193 kind = PyUnicode_KIND(u); 10194 data = PyUnicode_DATA(u); 10195 if (left) 10196 FILL(kind, data, fill, 0, left); 10197 if (right) 10198 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 10199 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self)); 10200 assert(_PyUnicode_CheckConsistency(u, 1)); 10201 return u; 10202 } 10203 10204 PyObject * 10205 PyUnicode_Splitlines(PyObject *string, int keepends) 10206 { 10207 PyObject *list; 10208 10209 if (ensure_unicode(string) < 0) 10210 return NULL; 10211 10212 switch (PyUnicode_KIND(string)) { 10213 case PyUnicode_1BYTE_KIND: 10214 if (PyUnicode_IS_ASCII(string)) 10215 list = asciilib_splitlines( 10216 string, PyUnicode_1BYTE_DATA(string), 10217 PyUnicode_GET_LENGTH(string), keepends); 10218 else 10219 list = ucs1lib_splitlines( 10220 string, PyUnicode_1BYTE_DATA(string), 10221 PyUnicode_GET_LENGTH(string), keepends); 10222 break; 10223 case PyUnicode_2BYTE_KIND: 10224 list = ucs2lib_splitlines( 10225 string, PyUnicode_2BYTE_DATA(string), 10226 PyUnicode_GET_LENGTH(string), keepends); 10227 break; 10228 case PyUnicode_4BYTE_KIND: 10229 list = ucs4lib_splitlines( 10230 string, PyUnicode_4BYTE_DATA(string), 10231 PyUnicode_GET_LENGTH(string), keepends); 10232 break; 10233 default: 10234 assert(0); 10235 list = 0; 10236 } 10237 return list; 10238 } 10239 10240 static PyObject * 10241 split(PyObject *self, 10242 PyObject *substring, 10243 Py_ssize_t maxcount) 10244 { 10245 int kind1, kind2; 10246 void *buf1, *buf2; 10247 Py_ssize_t len1, len2; 10248 PyObject* out; 10249 10250 if (maxcount < 0) 10251 maxcount = PY_SSIZE_T_MAX; 10252 10253 if (PyUnicode_READY(self) == -1) 10254 return NULL; 10255 10256 
if (substring == NULL) 10257 switch (PyUnicode_KIND(self)) { 10258 case PyUnicode_1BYTE_KIND: 10259 if (PyUnicode_IS_ASCII(self)) 10260 return asciilib_split_whitespace( 10261 self, PyUnicode_1BYTE_DATA(self), 10262 PyUnicode_GET_LENGTH(self), maxcount 10263 ); 10264 else 10265 return ucs1lib_split_whitespace( 10266 self, PyUnicode_1BYTE_DATA(self), 10267 PyUnicode_GET_LENGTH(self), maxcount 10268 ); 10269 case PyUnicode_2BYTE_KIND: 10270 return ucs2lib_split_whitespace( 10271 self, PyUnicode_2BYTE_DATA(self), 10272 PyUnicode_GET_LENGTH(self), maxcount 10273 ); 10274 case PyUnicode_4BYTE_KIND: 10275 return ucs4lib_split_whitespace( 10276 self, PyUnicode_4BYTE_DATA(self), 10277 PyUnicode_GET_LENGTH(self), maxcount 10278 ); 10279 default: 10280 assert(0); 10281 return NULL; 10282 } 10283 10284 if (PyUnicode_READY(substring) == -1) 10285 return NULL; 10286 10287 kind1 = PyUnicode_KIND(self); 10288 kind2 = PyUnicode_KIND(substring); 10289 len1 = PyUnicode_GET_LENGTH(self); 10290 len2 = PyUnicode_GET_LENGTH(substring); 10291 if (kind1 < kind2 || len1 < len2) { 10292 out = PyList_New(1); 10293 if (out == NULL) 10294 return NULL; 10295 Py_INCREF(self); 10296 PyList_SET_ITEM(out, 0, self); 10297 return out; 10298 } 10299 buf1 = PyUnicode_DATA(self); 10300 buf2 = PyUnicode_DATA(substring); 10301 if (kind2 != kind1) { 10302 buf2 = _PyUnicode_AsKind(substring, kind1); 10303 if (!buf2) 10304 return NULL; 10305 } 10306 10307 switch (kind1) { 10308 case PyUnicode_1BYTE_KIND: 10309 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10310 out = asciilib_split( 10311 self, buf1, len1, buf2, len2, maxcount); 10312 else 10313 out = ucs1lib_split( 10314 self, buf1, len1, buf2, len2, maxcount); 10315 break; 10316 case PyUnicode_2BYTE_KIND: 10317 out = ucs2lib_split( 10318 self, buf1, len1, buf2, len2, maxcount); 10319 break; 10320 case PyUnicode_4BYTE_KIND: 10321 out = ucs4lib_split( 10322 self, buf1, len1, buf2, len2, maxcount); 10323 break; 10324 default: 10325 out = 
NULL; 10326 } 10327 if (kind2 != kind1) 10328 PyMem_Free(buf2); 10329 return out; 10330 } 10331 10332 static PyObject * 10333 rsplit(PyObject *self, 10334 PyObject *substring, 10335 Py_ssize_t maxcount) 10336 { 10337 int kind1, kind2; 10338 void *buf1, *buf2; 10339 Py_ssize_t len1, len2; 10340 PyObject* out; 10341 10342 if (maxcount < 0) 10343 maxcount = PY_SSIZE_T_MAX; 10344 10345 if (PyUnicode_READY(self) == -1) 10346 return NULL; 10347 10348 if (substring == NULL) 10349 switch (PyUnicode_KIND(self)) { 10350 case PyUnicode_1BYTE_KIND: 10351 if (PyUnicode_IS_ASCII(self)) 10352 return asciilib_rsplit_whitespace( 10353 self, PyUnicode_1BYTE_DATA(self), 10354 PyUnicode_GET_LENGTH(self), maxcount 10355 ); 10356 else 10357 return ucs1lib_rsplit_whitespace( 10358 self, PyUnicode_1BYTE_DATA(self), 10359 PyUnicode_GET_LENGTH(self), maxcount 10360 ); 10361 case PyUnicode_2BYTE_KIND: 10362 return ucs2lib_rsplit_whitespace( 10363 self, PyUnicode_2BYTE_DATA(self), 10364 PyUnicode_GET_LENGTH(self), maxcount 10365 ); 10366 case PyUnicode_4BYTE_KIND: 10367 return ucs4lib_rsplit_whitespace( 10368 self, PyUnicode_4BYTE_DATA(self), 10369 PyUnicode_GET_LENGTH(self), maxcount 10370 ); 10371 default: 10372 assert(0); 10373 return NULL; 10374 } 10375 10376 if (PyUnicode_READY(substring) == -1) 10377 return NULL; 10378 10379 kind1 = PyUnicode_KIND(self); 10380 kind2 = PyUnicode_KIND(substring); 10381 len1 = PyUnicode_GET_LENGTH(self); 10382 len2 = PyUnicode_GET_LENGTH(substring); 10383 if (kind1 < kind2 || len1 < len2) { 10384 out = PyList_New(1); 10385 if (out == NULL) 10386 return NULL; 10387 Py_INCREF(self); 10388 PyList_SET_ITEM(out, 0, self); 10389 return out; 10390 } 10391 buf1 = PyUnicode_DATA(self); 10392 buf2 = PyUnicode_DATA(substring); 10393 if (kind2 != kind1) { 10394 buf2 = _PyUnicode_AsKind(substring, kind1); 10395 if (!buf2) 10396 return NULL; 10397 } 10398 10399 switch (kind1) { 10400 case PyUnicode_1BYTE_KIND: 10401 if (PyUnicode_IS_ASCII(self) && 
PyUnicode_IS_ASCII(substring)) 10402 out = asciilib_rsplit( 10403 self, buf1, len1, buf2, len2, maxcount); 10404 else 10405 out = ucs1lib_rsplit( 10406 self, buf1, len1, buf2, len2, maxcount); 10407 break; 10408 case PyUnicode_2BYTE_KIND: 10409 out = ucs2lib_rsplit( 10410 self, buf1, len1, buf2, len2, maxcount); 10411 break; 10412 case PyUnicode_4BYTE_KIND: 10413 out = ucs4lib_rsplit( 10414 self, buf1, len1, buf2, len2, maxcount); 10415 break; 10416 default: 10417 out = NULL; 10418 } 10419 if (kind2 != kind1) 10420 PyMem_Free(buf2); 10421 return out; 10422 } 10423 10424 static Py_ssize_t 10425 anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 10426 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 10427 { 10428 switch (kind) { 10429 case PyUnicode_1BYTE_KIND: 10430 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 10431 return asciilib_find(buf1, len1, buf2, len2, offset); 10432 else 10433 return ucs1lib_find(buf1, len1, buf2, len2, offset); 10434 case PyUnicode_2BYTE_KIND: 10435 return ucs2lib_find(buf1, len1, buf2, len2, offset); 10436 case PyUnicode_4BYTE_KIND: 10437 return ucs4lib_find(buf1, len1, buf2, len2, offset); 10438 } 10439 assert(0); 10440 return -1; 10441 } 10442 10443 static Py_ssize_t 10444 anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 10445 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 10446 { 10447 switch (kind) { 10448 case PyUnicode_1BYTE_KIND: 10449 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 10450 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 10451 else 10452 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 10453 case PyUnicode_2BYTE_KIND: 10454 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 10455 case PyUnicode_4BYTE_KIND: 10456 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 10457 } 10458 assert(0); 10459 return 0; 10460 } 10461 10462 static void 10463 replace_1char_inplace(PyObject *u, Py_ssize_t pos, 10464 
Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount) 10465 { 10466 int kind = PyUnicode_KIND(u); 10467 void *data = PyUnicode_DATA(u); 10468 Py_ssize_t len = PyUnicode_GET_LENGTH(u); 10469 if (kind == PyUnicode_1BYTE_KIND) { 10470 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos, 10471 (Py_UCS1 *)data + len, 10472 u1, u2, maxcount); 10473 } 10474 else if (kind == PyUnicode_2BYTE_KIND) { 10475 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos, 10476 (Py_UCS2 *)data + len, 10477 u1, u2, maxcount); 10478 } 10479 else { 10480 assert(kind == PyUnicode_4BYTE_KIND); 10481 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos, 10482 (Py_UCS4 *)data + len, 10483 u1, u2, maxcount); 10484 } 10485 } 10486 10487 static PyObject * 10488 replace(PyObject *self, PyObject *str1, 10489 PyObject *str2, Py_ssize_t maxcount) 10490 { 10491 PyObject *u; 10492 char *sbuf = PyUnicode_DATA(self); 10493 char *buf1 = PyUnicode_DATA(str1); 10494 char *buf2 = PyUnicode_DATA(str2); 10495 int srelease = 0, release1 = 0, release2 = 0; 10496 int skind = PyUnicode_KIND(self); 10497 int kind1 = PyUnicode_KIND(str1); 10498 int kind2 = PyUnicode_KIND(str2); 10499 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 10500 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 10501 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 10502 int mayshrink; 10503 Py_UCS4 maxchar, maxchar_str1, maxchar_str2; 10504 10505 if (maxcount < 0) 10506 maxcount = PY_SSIZE_T_MAX; 10507 else if (maxcount == 0 || slen == 0) 10508 goto nothing; 10509 10510 if (str1 == str2) 10511 goto nothing; 10512 10513 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10514 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1); 10515 if (maxchar < maxchar_str1) 10516 /* substring too wide to be present */ 10517 goto nothing; 10518 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 10519 /* Replacing str1 with str2 may cause a maxchar reduction in the 10520 result string. 
*/ 10521 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1); 10522 maxchar = Py_MAX(maxchar, maxchar_str2); 10523 10524 if (len1 == len2) { 10525 /* same length */ 10526 if (len1 == 0) 10527 goto nothing; 10528 if (len1 == 1) { 10529 /* replace characters */ 10530 Py_UCS4 u1, u2; 10531 Py_ssize_t pos; 10532 10533 u1 = PyUnicode_READ(kind1, buf1, 0); 10534 pos = findchar(sbuf, skind, slen, u1, 1); 10535 if (pos < 0) 10536 goto nothing; 10537 u2 = PyUnicode_READ(kind2, buf2, 0); 10538 u = PyUnicode_New(slen, maxchar); 10539 if (!u) 10540 goto error; 10541 10542 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen); 10543 replace_1char_inplace(u, pos, u1, u2, maxcount); 10544 } 10545 else { 10546 int rkind = skind; 10547 char *res; 10548 Py_ssize_t i; 10549 10550 if (kind1 < rkind) { 10551 /* widen substring */ 10552 buf1 = _PyUnicode_AsKind(str1, rkind); 10553 if (!buf1) goto error; 10554 release1 = 1; 10555 } 10556 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 10557 if (i < 0) 10558 goto nothing; 10559 if (rkind > kind2) { 10560 /* widen replacement */ 10561 buf2 = _PyUnicode_AsKind(str2, rkind); 10562 if (!buf2) goto error; 10563 release2 = 1; 10564 } 10565 else if (rkind < kind2) { 10566 /* widen self and buf1 */ 10567 rkind = kind2; 10568 if (release1) PyMem_Free(buf1); 10569 release1 = 0; 10570 sbuf = _PyUnicode_AsKind(self, rkind); 10571 if (!sbuf) goto error; 10572 srelease = 1; 10573 buf1 = _PyUnicode_AsKind(str1, rkind); 10574 if (!buf1) goto error; 10575 release1 = 1; 10576 } 10577 u = PyUnicode_New(slen, maxchar); 10578 if (!u) 10579 goto error; 10580 assert(PyUnicode_KIND(u) == rkind); 10581 res = PyUnicode_DATA(u); 10582 10583 memcpy(res, sbuf, rkind * slen); 10584 /* change everything in-place, starting with this one */ 10585 memcpy(res + rkind * i, 10586 buf2, 10587 rkind * len2); 10588 i += len1; 10589 10590 while ( --maxcount > 0) { 10591 i = anylib_find(rkind, self, 10592 sbuf+rkind*i, slen-i, 10593 str1, buf1, 
len1, i); 10594 if (i == -1) 10595 break; 10596 memcpy(res + rkind * i, 10597 buf2, 10598 rkind * len2); 10599 i += len1; 10600 } 10601 } 10602 } 10603 else { 10604 Py_ssize_t n, i, j, ires; 10605 Py_ssize_t new_size; 10606 int rkind = skind; 10607 char *res; 10608 10609 if (kind1 < rkind) { 10610 /* widen substring */ 10611 buf1 = _PyUnicode_AsKind(str1, rkind); 10612 if (!buf1) goto error; 10613 release1 = 1; 10614 } 10615 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10616 if (n == 0) 10617 goto nothing; 10618 if (kind2 < rkind) { 10619 /* widen replacement */ 10620 buf2 = _PyUnicode_AsKind(str2, rkind); 10621 if (!buf2) goto error; 10622 release2 = 1; 10623 } 10624 else if (kind2 > rkind) { 10625 /* widen self and buf1 */ 10626 rkind = kind2; 10627 sbuf = _PyUnicode_AsKind(self, rkind); 10628 if (!sbuf) goto error; 10629 srelease = 1; 10630 if (release1) PyMem_Free(buf1); 10631 release1 = 0; 10632 buf1 = _PyUnicode_AsKind(str1, rkind); 10633 if (!buf1) goto error; 10634 release1 = 1; 10635 } 10636 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10637 PyUnicode_GET_LENGTH(str1))); */ 10638 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) { 10639 PyErr_SetString(PyExc_OverflowError, 10640 "replace string is too long"); 10641 goto error; 10642 } 10643 new_size = slen + n * (len2 - len1); 10644 if (new_size == 0) { 10645 _Py_INCREF_UNICODE_EMPTY(); 10646 if (!unicode_empty) 10647 goto error; 10648 u = unicode_empty; 10649 goto done; 10650 } 10651 if (new_size > (PY_SSIZE_T_MAX / rkind)) { 10652 PyErr_SetString(PyExc_OverflowError, 10653 "replace string is too long"); 10654 goto error; 10655 } 10656 u = PyUnicode_New(new_size, maxchar); 10657 if (!u) 10658 goto error; 10659 assert(PyUnicode_KIND(u) == rkind); 10660 res = PyUnicode_DATA(u); 10661 ires = i = 0; 10662 if (len1 > 0) { 10663 while (n-- > 0) { 10664 /* look for next match */ 10665 j = anylib_find(rkind, self, 10666 sbuf + rkind * i, 
slen-i, 10667 str1, buf1, len1, i); 10668 if (j == -1) 10669 break; 10670 else if (j > i) { 10671 /* copy unchanged part [i:j] */ 10672 memcpy(res + rkind * ires, 10673 sbuf + rkind * i, 10674 rkind * (j-i)); 10675 ires += j - i; 10676 } 10677 /* copy substitution string */ 10678 if (len2 > 0) { 10679 memcpy(res + rkind * ires, 10680 buf2, 10681 rkind * len2); 10682 ires += len2; 10683 } 10684 i = j + len1; 10685 } 10686 if (i < slen) 10687 /* copy tail [i:] */ 10688 memcpy(res + rkind * ires, 10689 sbuf + rkind * i, 10690 rkind * (slen-i)); 10691 } 10692 else { 10693 /* interleave */ 10694 while (n > 0) { 10695 memcpy(res + rkind * ires, 10696 buf2, 10697 rkind * len2); 10698 ires += len2; 10699 if (--n <= 0) 10700 break; 10701 memcpy(res + rkind * ires, 10702 sbuf + rkind * i, 10703 rkind); 10704 ires++; 10705 i++; 10706 } 10707 memcpy(res + rkind * ires, 10708 sbuf + rkind * i, 10709 rkind * (slen-i)); 10710 } 10711 } 10712 10713 if (mayshrink) { 10714 unicode_adjust_maxchar(&u); 10715 if (u == NULL) 10716 goto error; 10717 } 10718 10719 done: 10720 if (srelease) 10721 PyMem_FREE(sbuf); 10722 if (release1) 10723 PyMem_FREE(buf1); 10724 if (release2) 10725 PyMem_FREE(buf2); 10726 assert(_PyUnicode_CheckConsistency(u, 1)); 10727 return u; 10728 10729 nothing: 10730 /* nothing to replace; return original string (when possible) */ 10731 if (srelease) 10732 PyMem_FREE(sbuf); 10733 if (release1) 10734 PyMem_FREE(buf1); 10735 if (release2) 10736 PyMem_FREE(buf2); 10737 return unicode_result_unchanged(self); 10738 10739 error: 10740 if (srelease && sbuf) 10741 PyMem_FREE(sbuf); 10742 if (release1 && buf1) 10743 PyMem_FREE(buf1); 10744 if (release2 && buf2) 10745 PyMem_FREE(buf2); 10746 return NULL; 10747 } 10748 10749 /* --- Unicode Object Methods --------------------------------------------- */ 10750 10751 PyDoc_STRVAR(title__doc__, 10752 "S.title() -> str\n\ 10753 \n\ 10754 Return a titlecased version of S, i.e. 
words start with title case\n\ 10755 characters, all remaining cased characters have lower case."); 10756 10757 static PyObject* 10758 unicode_title(PyObject *self) 10759 { 10760 if (PyUnicode_READY(self) == -1) 10761 return NULL; 10762 return case_operation(self, do_title); 10763 } 10764 10765 PyDoc_STRVAR(capitalize__doc__, 10766 "S.capitalize() -> str\n\ 10767 \n\ 10768 Return a capitalized version of S, i.e. make the first character\n\ 10769 have upper case and the rest lower case."); 10770 10771 static PyObject* 10772 unicode_capitalize(PyObject *self) 10773 { 10774 if (PyUnicode_READY(self) == -1) 10775 return NULL; 10776 if (PyUnicode_GET_LENGTH(self) == 0) 10777 return unicode_result_unchanged(self); 10778 return case_operation(self, do_capitalize); 10779 } 10780 10781 PyDoc_STRVAR(casefold__doc__, 10782 "S.casefold() -> str\n\ 10783 \n\ 10784 Return a version of S suitable for caseless comparisons."); 10785 10786 static PyObject * 10787 unicode_casefold(PyObject *self) 10788 { 10789 if (PyUnicode_READY(self) == -1) 10790 return NULL; 10791 if (PyUnicode_IS_ASCII(self)) 10792 return ascii_upper_or_lower(self, 1); 10793 return case_operation(self, do_casefold); 10794 } 10795 10796 10797 /* Argument converter. Accepts a single Unicode character. 
*/ 10798 10799 static int 10800 convert_uc(PyObject *obj, void *addr) 10801 { 10802 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10803 10804 if (!PyUnicode_Check(obj)) { 10805 PyErr_Format(PyExc_TypeError, 10806 "The fill character must be a unicode character, " 10807 "not %.100s", Py_TYPE(obj)->tp_name); 10808 return 0; 10809 } 10810 if (PyUnicode_READY(obj) < 0) 10811 return 0; 10812 if (PyUnicode_GET_LENGTH(obj) != 1) { 10813 PyErr_SetString(PyExc_TypeError, 10814 "The fill character must be exactly one character long"); 10815 return 0; 10816 } 10817 *fillcharloc = PyUnicode_READ_CHAR(obj, 0); 10818 return 1; 10819 } 10820 10821 PyDoc_STRVAR(center__doc__, 10822 "S.center(width[, fillchar]) -> str\n\ 10823 \n\ 10824 Return S centered in a string of length width. Padding is\n\ 10825 done using the specified fill character (default is a space)"); 10826 10827 static PyObject * 10828 unicode_center(PyObject *self, PyObject *args) 10829 { 10830 Py_ssize_t marg, left; 10831 Py_ssize_t width; 10832 Py_UCS4 fillchar = ' '; 10833 10834 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 10835 return NULL; 10836 10837 if (PyUnicode_READY(self) == -1) 10838 return NULL; 10839 10840 if (PyUnicode_GET_LENGTH(self) >= width) 10841 return unicode_result_unchanged(self); 10842 10843 marg = width - PyUnicode_GET_LENGTH(self); 10844 left = marg / 2 + (marg & width & 1); 10845 10846 return pad(self, left, marg - left, fillchar); 10847 } 10848 10849 /* This function assumes that str1 and str2 are readied by the caller. */ 10850 10851 static int 10852 unicode_compare(PyObject *str1, PyObject *str2) 10853 { 10854 #define COMPARE(TYPE1, TYPE2) \ 10855 do { \ 10856 TYPE1* p1 = (TYPE1 *)data1; \ 10857 TYPE2* p2 = (TYPE2 *)data2; \ 10858 TYPE1* end = p1 + len; \ 10859 Py_UCS4 c1, c2; \ 10860 for (; p1 != end; p1++, p2++) { \ 10861 c1 = *p1; \ 10862 c2 = *p2; \ 10863 if (c1 != c2) \ 10864 return (c1 < c2) ? 
-1 : 1; \ 10865 } \ 10866 } \ 10867 while (0) 10868 10869 int kind1, kind2; 10870 void *data1, *data2; 10871 Py_ssize_t len1, len2, len; 10872 10873 kind1 = PyUnicode_KIND(str1); 10874 kind2 = PyUnicode_KIND(str2); 10875 data1 = PyUnicode_DATA(str1); 10876 data2 = PyUnicode_DATA(str2); 10877 len1 = PyUnicode_GET_LENGTH(str1); 10878 len2 = PyUnicode_GET_LENGTH(str2); 10879 len = Py_MIN(len1, len2); 10880 10881 switch(kind1) { 10882 case PyUnicode_1BYTE_KIND: 10883 { 10884 switch(kind2) { 10885 case PyUnicode_1BYTE_KIND: 10886 { 10887 int cmp = memcmp(data1, data2, len); 10888 /* normalize result of memcmp() into the range [-1; 1] */ 10889 if (cmp < 0) 10890 return -1; 10891 if (cmp > 0) 10892 return 1; 10893 break; 10894 } 10895 case PyUnicode_2BYTE_KIND: 10896 COMPARE(Py_UCS1, Py_UCS2); 10897 break; 10898 case PyUnicode_4BYTE_KIND: 10899 COMPARE(Py_UCS1, Py_UCS4); 10900 break; 10901 default: 10902 assert(0); 10903 } 10904 break; 10905 } 10906 case PyUnicode_2BYTE_KIND: 10907 { 10908 switch(kind2) { 10909 case PyUnicode_1BYTE_KIND: 10910 COMPARE(Py_UCS2, Py_UCS1); 10911 break; 10912 case PyUnicode_2BYTE_KIND: 10913 { 10914 COMPARE(Py_UCS2, Py_UCS2); 10915 break; 10916 } 10917 case PyUnicode_4BYTE_KIND: 10918 COMPARE(Py_UCS2, Py_UCS4); 10919 break; 10920 default: 10921 assert(0); 10922 } 10923 break; 10924 } 10925 case PyUnicode_4BYTE_KIND: 10926 { 10927 switch(kind2) { 10928 case PyUnicode_1BYTE_KIND: 10929 COMPARE(Py_UCS4, Py_UCS1); 10930 break; 10931 case PyUnicode_2BYTE_KIND: 10932 COMPARE(Py_UCS4, Py_UCS2); 10933 break; 10934 case PyUnicode_4BYTE_KIND: 10935 { 10936 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4 10937 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len); 10938 /* normalize result of wmemcmp() into the range [-1; 1] */ 10939 if (cmp < 0) 10940 return -1; 10941 if (cmp > 0) 10942 return 1; 10943 #else 10944 COMPARE(Py_UCS4, Py_UCS4); 10945 #endif 10946 break; 10947 } 10948 default: 10949 assert(0); 10950 } 10951 break; 10952 } 10953 
default: 10954 assert(0); 10955 } 10956 10957 if (len1 == len2) 10958 return 0; 10959 if (len1 < len2) 10960 return -1; 10961 else 10962 return 1; 10963 10964 #undef COMPARE 10965 } 10966 10967 static int 10968 unicode_compare_eq(PyObject *str1, PyObject *str2) 10969 { 10970 int kind; 10971 void *data1, *data2; 10972 Py_ssize_t len; 10973 int cmp; 10974 10975 len = PyUnicode_GET_LENGTH(str1); 10976 if (PyUnicode_GET_LENGTH(str2) != len) 10977 return 0; 10978 kind = PyUnicode_KIND(str1); 10979 if (PyUnicode_KIND(str2) != kind) 10980 return 0; 10981 data1 = PyUnicode_DATA(str1); 10982 data2 = PyUnicode_DATA(str2); 10983 10984 cmp = memcmp(data1, data2, len * kind); 10985 return (cmp == 0); 10986 } 10987 10988 10989 int 10990 PyUnicode_Compare(PyObject *left, PyObject *right) 10991 { 10992 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10993 if (PyUnicode_READY(left) == -1 || 10994 PyUnicode_READY(right) == -1) 10995 return -1; 10996 10997 /* a string is equal to itself */ 10998 if (left == right) 10999 return 0; 11000 11001 return unicode_compare(left, right); 11002 } 11003 PyErr_Format(PyExc_TypeError, 11004 "Can't compare %.100s and %.100s", 11005 left->ob_type->tp_name, 11006 right->ob_type->tp_name); 11007 return -1; 11008 } 11009 11010 int 11011 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 11012 { 11013 Py_ssize_t i; 11014 int kind; 11015 Py_UCS4 chr; 11016 const unsigned char *ustr = (const unsigned char *)str; 11017 11018 assert(_PyUnicode_CHECK(uni)); 11019 if (!PyUnicode_IS_READY(uni)) { 11020 const wchar_t *ws = _PyUnicode_WSTR(uni); 11021 /* Compare Unicode string and source character set string */ 11022 for (i = 0; (chr = ws[i]) && ustr[i]; i++) { 11023 if (chr != ustr[i]) 11024 return (chr < ustr[i]) ? -1 : 1; 11025 } 11026 /* This check keeps Python strings that end in '\0' from comparing equal 11027 to C strings identical up to that point. 
*/ 11028 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr) 11029 return 1; /* uni is longer */ 11030 if (ustr[i]) 11031 return -1; /* str is longer */ 11032 return 0; 11033 } 11034 kind = PyUnicode_KIND(uni); 11035 if (kind == PyUnicode_1BYTE_KIND) { 11036 const void *data = PyUnicode_1BYTE_DATA(uni); 11037 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni); 11038 size_t len, len2 = strlen(str); 11039 int cmp; 11040 11041 len = Py_MIN(len1, len2); 11042 cmp = memcmp(data, str, len); 11043 if (cmp != 0) { 11044 if (cmp < 0) 11045 return -1; 11046 else 11047 return 1; 11048 } 11049 if (len1 > len2) 11050 return 1; /* uni is longer */ 11051 if (len1 < len2) 11052 return -1; /* str is longer */ 11053 return 0; 11054 } 11055 else { 11056 void *data = PyUnicode_DATA(uni); 11057 /* Compare Unicode string and source character set string */ 11058 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 11059 if (chr != (unsigned char)str[i]) 11060 return (chr < (unsigned char)(str[i])) ? -1 : 1; 11061 /* This check keeps Python strings that end in '\0' from comparing equal 11062 to C strings identical up to that point. 
*/ 11063 if (PyUnicode_GET_LENGTH(uni) != i || chr) 11064 return 1; /* uni is longer */ 11065 if (str[i]) 11066 return -1; /* str is longer */ 11067 return 0; 11068 } 11069 } 11070 11071 static int 11072 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str) 11073 { 11074 size_t i, len; 11075 const wchar_t *p; 11076 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode); 11077 if (strlen(str) != len) 11078 return 0; 11079 p = _PyUnicode_WSTR(unicode); 11080 assert(p); 11081 for (i = 0; i < len; i++) { 11082 unsigned char c = (unsigned char)str[i]; 11083 if (c >= 128 || p[i] != (wchar_t)c) 11084 return 0; 11085 } 11086 return 1; 11087 } 11088 11089 int 11090 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str) 11091 { 11092 size_t len; 11093 assert(_PyUnicode_CHECK(unicode)); 11094 assert(str); 11095 #ifndef NDEBUG 11096 for (const char *p = str; *p; p++) { 11097 assert((unsigned char)*p < 128); 11098 } 11099 #endif 11100 if (PyUnicode_READY(unicode) == -1) { 11101 /* Memory error or bad data */ 11102 PyErr_Clear(); 11103 return non_ready_unicode_equal_to_ascii_string(unicode, str); 11104 } 11105 if (!PyUnicode_IS_ASCII(unicode)) 11106 return 0; 11107 len = (size_t)PyUnicode_GET_LENGTH(unicode); 11108 return strlen(str) == len && 11109 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0; 11110 } 11111 11112 int 11113 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right) 11114 { 11115 PyObject *right_uni; 11116 Py_hash_t hash; 11117 11118 assert(_PyUnicode_CHECK(left)); 11119 assert(right->string); 11120 #ifndef NDEBUG 11121 for (const char *p = right->string; *p; p++) { 11122 assert((unsigned char)*p < 128); 11123 } 11124 #endif 11125 11126 if (PyUnicode_READY(left) == -1) { 11127 /* memory error or bad data */ 11128 PyErr_Clear(); 11129 return non_ready_unicode_equal_to_ascii_string(left, right->string); 11130 } 11131 11132 if (!PyUnicode_IS_ASCII(left)) 11133 return 0; 11134 11135 right_uni = _PyUnicode_FromId(right); /* 
borrowed */ 11136 if (right_uni == NULL) { 11137 /* memory error or bad data */ 11138 PyErr_Clear(); 11139 return _PyUnicode_EqualToASCIIString(left, right->string); 11140 } 11141 11142 if (left == right_uni) 11143 return 1; 11144 11145 if (PyUnicode_CHECK_INTERNED(left)) 11146 return 0; 11147 11148 assert(_PyUnicode_HASH(right_uni) != 1); 11149 hash = _PyUnicode_HASH(left); 11150 if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) 11151 return 0; 11152 11153 return unicode_compare_eq(left, right_uni); 11154 } 11155 11156 #define TEST_COND(cond) \ 11157 ((cond) ? Py_True : Py_False) 11158 11159 PyObject * 11160 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 11161 { 11162 int result; 11163 PyObject *v; 11164 11165 if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) 11166 Py_RETURN_NOTIMPLEMENTED; 11167 11168 if (PyUnicode_READY(left) == -1 || 11169 PyUnicode_READY(right) == -1) 11170 return NULL; 11171 11172 if (left == right) { 11173 switch (op) { 11174 case Py_EQ: 11175 case Py_LE: 11176 case Py_GE: 11177 /* a string is equal to itself */ 11178 v = Py_True; 11179 break; 11180 case Py_NE: 11181 case Py_LT: 11182 case Py_GT: 11183 v = Py_False; 11184 break; 11185 default: 11186 PyErr_BadArgument(); 11187 return NULL; 11188 } 11189 } 11190 else if (op == Py_EQ || op == Py_NE) { 11191 result = unicode_compare_eq(left, right); 11192 result ^= (op == Py_NE); 11193 v = TEST_COND(result); 11194 } 11195 else { 11196 result = unicode_compare(left, right); 11197 11198 /* Convert the return value to a Boolean */ 11199 switch (op) { 11200 case Py_LE: 11201 v = TEST_COND(result <= 0); 11202 break; 11203 case Py_GE: 11204 v = TEST_COND(result >= 0); 11205 break; 11206 case Py_LT: 11207 v = TEST_COND(result == -1); 11208 break; 11209 case Py_GT: 11210 v = TEST_COND(result == 1); 11211 break; 11212 default: 11213 PyErr_BadArgument(); 11214 return NULL; 11215 } 11216 } 11217 Py_INCREF(v); 11218 return v; 11219 } 11220 11221 int 11222 _PyUnicode_EQ(PyObject *aa, 
PyObject *bb) 11223 { 11224 return unicode_eq(aa, bb); 11225 } 11226 11227 int 11228 PyUnicode_Contains(PyObject *str, PyObject *substr) 11229 { 11230 int kind1, kind2; 11231 void *buf1, *buf2; 11232 Py_ssize_t len1, len2; 11233 int result; 11234 11235 if (!PyUnicode_Check(substr)) { 11236 PyErr_Format(PyExc_TypeError, 11237 "'in <string>' requires string as left operand, not %.100s", 11238 Py_TYPE(substr)->tp_name); 11239 return -1; 11240 } 11241 if (PyUnicode_READY(substr) == -1) 11242 return -1; 11243 if (ensure_unicode(str) < 0) 11244 return -1; 11245 11246 kind1 = PyUnicode_KIND(str); 11247 kind2 = PyUnicode_KIND(substr); 11248 if (kind1 < kind2) 11249 return 0; 11250 len1 = PyUnicode_GET_LENGTH(str); 11251 len2 = PyUnicode_GET_LENGTH(substr); 11252 if (len1 < len2) 11253 return 0; 11254 buf1 = PyUnicode_DATA(str); 11255 buf2 = PyUnicode_DATA(substr); 11256 if (len2 == 1) { 11257 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0); 11258 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1; 11259 return result; 11260 } 11261 if (kind2 != kind1) { 11262 buf2 = _PyUnicode_AsKind(substr, kind1); 11263 if (!buf2) 11264 return -1; 11265 } 11266 11267 switch (kind1) { 11268 case PyUnicode_1BYTE_KIND: 11269 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 11270 break; 11271 case PyUnicode_2BYTE_KIND: 11272 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 11273 break; 11274 case PyUnicode_4BYTE_KIND: 11275 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 11276 break; 11277 default: 11278 result = -1; 11279 assert(0); 11280 } 11281 11282 if (kind2 != kind1) 11283 PyMem_Free(buf2); 11284 11285 return result; 11286 } 11287 11288 /* Concat to string or Unicode object giving a new Unicode object. 
*/ 11289 11290 PyObject * 11291 PyUnicode_Concat(PyObject *left, PyObject *right) 11292 { 11293 PyObject *result; 11294 Py_UCS4 maxchar, maxchar2; 11295 Py_ssize_t left_len, right_len, new_len; 11296 11297 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0) 11298 return NULL; 11299 11300 /* Shortcuts */ 11301 if (left == unicode_empty) 11302 return PyUnicode_FromObject(right); 11303 if (right == unicode_empty) 11304 return PyUnicode_FromObject(left); 11305 11306 left_len = PyUnicode_GET_LENGTH(left); 11307 right_len = PyUnicode_GET_LENGTH(right); 11308 if (left_len > PY_SSIZE_T_MAX - right_len) { 11309 PyErr_SetString(PyExc_OverflowError, 11310 "strings are too large to concat"); 11311 return NULL; 11312 } 11313 new_len = left_len + right_len; 11314 11315 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 11316 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 11317 maxchar = Py_MAX(maxchar, maxchar2); 11318 11319 /* Concat the two Unicode strings */ 11320 result = PyUnicode_New(new_len, maxchar); 11321 if (result == NULL) 11322 return NULL; 11323 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len); 11324 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len); 11325 assert(_PyUnicode_CheckConsistency(result, 1)); 11326 return result; 11327 } 11328 11329 void 11330 PyUnicode_Append(PyObject **p_left, PyObject *right) 11331 { 11332 PyObject *left, *res; 11333 Py_UCS4 maxchar, maxchar2; 11334 Py_ssize_t left_len, right_len, new_len; 11335 11336 if (p_left == NULL) { 11337 if (!PyErr_Occurred()) 11338 PyErr_BadInternalCall(); 11339 return; 11340 } 11341 left = *p_left; 11342 if (right == NULL || left == NULL 11343 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) { 11344 if (!PyErr_Occurred()) 11345 PyErr_BadInternalCall(); 11346 goto error; 11347 } 11348 11349 if (PyUnicode_READY(left) == -1) 11350 goto error; 11351 if (PyUnicode_READY(right) == -1) 11352 goto error; 11353 11354 /* Shortcuts */ 11355 if (left == unicode_empty) { 11356 
Py_DECREF(left); 11357 Py_INCREF(right); 11358 *p_left = right; 11359 return; 11360 } 11361 if (right == unicode_empty) 11362 return; 11363 11364 left_len = PyUnicode_GET_LENGTH(left); 11365 right_len = PyUnicode_GET_LENGTH(right); 11366 if (left_len > PY_SSIZE_T_MAX - right_len) { 11367 PyErr_SetString(PyExc_OverflowError, 11368 "strings are too large to concat"); 11369 goto error; 11370 } 11371 new_len = left_len + right_len; 11372 11373 if (unicode_modifiable(left) 11374 && PyUnicode_CheckExact(right) 11375 && PyUnicode_KIND(right) <= PyUnicode_KIND(left) 11376 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 11377 to change the structure size, but characters are stored just after 11378 the structure, and so it requires to move all characters which is 11379 not so different than duplicating the string. */ 11380 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 11381 { 11382 /* append inplace */ 11383 if (unicode_resize(p_left, new_len) != 0) 11384 goto error; 11385 11386 /* copy 'right' into the newly allocated area of 'left' */ 11387 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len); 11388 } 11389 else { 11390 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 11391 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 11392 maxchar = Py_MAX(maxchar, maxchar2); 11393 11394 /* Concat the two Unicode strings */ 11395 res = PyUnicode_New(new_len, maxchar); 11396 if (res == NULL) 11397 goto error; 11398 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len); 11399 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len); 11400 Py_DECREF(left); 11401 *p_left = res; 11402 } 11403 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 11404 return; 11405 11406 error: 11407 Py_CLEAR(*p_left); 11408 } 11409 11410 void 11411 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 11412 { 11413 PyUnicode_Append(pleft, right); 11414 Py_XDECREF(right); 11415 } 11416 11417 /* 11418 Wraps stringlib_parse_args_finds() and additionally 
ensures that the 11419 first argument is a unicode object. 11420 */ 11421 11422 static inline int 11423 parse_args_finds_unicode(const char * function_name, PyObject *args, 11424 PyObject **substring, 11425 Py_ssize_t *start, Py_ssize_t *end) 11426 { 11427 if(stringlib_parse_args_finds(function_name, args, substring, 11428 start, end)) { 11429 if (ensure_unicode(*substring) < 0) 11430 return 0; 11431 return 1; 11432 } 11433 return 0; 11434 } 11435 11436 PyDoc_STRVAR(count__doc__, 11437 "S.count(sub[, start[, end]]) -> int\n\ 11438 \n\ 11439 Return the number of non-overlapping occurrences of substring sub in\n\ 11440 string S[start:end]. Optional arguments start and end are\n\ 11441 interpreted as in slice notation."); 11442 11443 static PyObject * 11444 unicode_count(PyObject *self, PyObject *args) 11445 { 11446 PyObject *substring = NULL; /* initialize to fix a compiler warning */ 11447 Py_ssize_t start = 0; 11448 Py_ssize_t end = PY_SSIZE_T_MAX; 11449 PyObject *result; 11450 int kind1, kind2; 11451 void *buf1, *buf2; 11452 Py_ssize_t len1, len2, iresult; 11453 11454 if (!parse_args_finds_unicode("count", args, &substring, &start, &end)) 11455 return NULL; 11456 11457 kind1 = PyUnicode_KIND(self); 11458 kind2 = PyUnicode_KIND(substring); 11459 if (kind1 < kind2) 11460 return PyLong_FromLong(0); 11461 11462 len1 = PyUnicode_GET_LENGTH(self); 11463 len2 = PyUnicode_GET_LENGTH(substring); 11464 ADJUST_INDICES(start, end, len1); 11465 if (end - start < len2) 11466 return PyLong_FromLong(0); 11467 11468 buf1 = PyUnicode_DATA(self); 11469 buf2 = PyUnicode_DATA(substring); 11470 if (kind2 != kind1) { 11471 buf2 = _PyUnicode_AsKind(substring, kind1); 11472 if (!buf2) 11473 return NULL; 11474 } 11475 switch (kind1) { 11476 case PyUnicode_1BYTE_KIND: 11477 iresult = ucs1lib_count( 11478 ((Py_UCS1*)buf1) + start, end - start, 11479 buf2, len2, PY_SSIZE_T_MAX 11480 ); 11481 break; 11482 case PyUnicode_2BYTE_KIND: 11483 iresult = ucs2lib_count( 11484 ((Py_UCS2*)buf1) + start, 
end - start, 11485 buf2, len2, PY_SSIZE_T_MAX 11486 ); 11487 break; 11488 case PyUnicode_4BYTE_KIND: 11489 iresult = ucs4lib_count( 11490 ((Py_UCS4*)buf1) + start, end - start, 11491 buf2, len2, PY_SSIZE_T_MAX 11492 ); 11493 break; 11494 default: 11495 assert(0); iresult = 0; 11496 } 11497 11498 result = PyLong_FromSsize_t(iresult); 11499 11500 if (kind2 != kind1) 11501 PyMem_Free(buf2); 11502 11503 return result; 11504 } 11505 11506 PyDoc_STRVAR(encode__doc__, 11507 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 11508 \n\ 11509 Encode S using the codec registered for encoding. Default encoding\n\ 11510 is 'utf-8'. errors may be given to set a different error\n\ 11511 handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 11512 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 11513 'xmlcharrefreplace' as well as any other name registered with\n\ 11514 codecs.register_error that can handle UnicodeEncodeErrors."); 11515 11516 static PyObject * 11517 unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs) 11518 { 11519 static char *kwlist[] = {"encoding", "errors", 0}; 11520 char *encoding = NULL; 11521 char *errors = NULL; 11522 11523 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 11524 kwlist, &encoding, &errors)) 11525 return NULL; 11526 return PyUnicode_AsEncodedString(self, encoding, errors); 11527 } 11528 11529 PyDoc_STRVAR(expandtabs__doc__, 11530 "S.expandtabs(tabsize=8) -> str\n\ 11531 \n\ 11532 Return a copy of S where all tab characters are expanded using spaces.\n\ 11533 If tabsize is not given, a tab size of 8 characters is assumed."); 11534 11535 static PyObject* 11536 unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds) 11537 { 11538 Py_ssize_t i, j, line_pos, src_len, incr; 11539 Py_UCS4 ch; 11540 PyObject *u; 11541 void *src_data, *dest_data; 11542 static char *kwlist[] = {"tabsize", 0}; 11543 int tabsize = 8; 11544 int kind; 11545 int found; 11546 11547 if 
(!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs", 11548 kwlist, &tabsize)) 11549 return NULL; 11550 11551 if (PyUnicode_READY(self) == -1) 11552 return NULL; 11553 11554 /* First pass: determine size of output string */ 11555 src_len = PyUnicode_GET_LENGTH(self); 11556 i = j = line_pos = 0; 11557 kind = PyUnicode_KIND(self); 11558 src_data = PyUnicode_DATA(self); 11559 found = 0; 11560 for (; i < src_len; i++) { 11561 ch = PyUnicode_READ(kind, src_data, i); 11562 if (ch == '\t') { 11563 found = 1; 11564 if (tabsize > 0) { 11565 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 11566 if (j > PY_SSIZE_T_MAX - incr) 11567 goto overflow; 11568 line_pos += incr; 11569 j += incr; 11570 } 11571 } 11572 else { 11573 if (j > PY_SSIZE_T_MAX - 1) 11574 goto overflow; 11575 line_pos++; 11576 j++; 11577 if (ch == '\n' || ch == '\r') 11578 line_pos = 0; 11579 } 11580 } 11581 if (!found) 11582 return unicode_result_unchanged(self); 11583 11584 /* Second pass: create output string and fill it */ 11585 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 11586 if (!u) 11587 return NULL; 11588 dest_data = PyUnicode_DATA(u); 11589 11590 i = j = line_pos = 0; 11591 11592 for (; i < src_len; i++) { 11593 ch = PyUnicode_READ(kind, src_data, i); 11594 if (ch == '\t') { 11595 if (tabsize > 0) { 11596 incr = tabsize - (line_pos % tabsize); 11597 line_pos += incr; 11598 FILL(kind, dest_data, ' ', j, incr); 11599 j += incr; 11600 } 11601 } 11602 else { 11603 line_pos++; 11604 PyUnicode_WRITE(kind, dest_data, j, ch); 11605 j++; 11606 if (ch == '\n' || ch == '\r') 11607 line_pos = 0; 11608 } 11609 } 11610 assert (j == PyUnicode_GET_LENGTH(u)); 11611 return unicode_result(u); 11612 11613 overflow: 11614 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 11615 return NULL; 11616 } 11617 11618 PyDoc_STRVAR(find__doc__, 11619 "S.find(sub[, start[, end]]) -> int\n\ 11620 \n\ 11621 Return the lowest index in S where substring sub is found,\n\ 11622 such that sub is 
contained within S[start:end]. Optional\n\ 11623 arguments start and end are interpreted as in slice notation.\n\ 11624 \n\ 11625 Return -1 on failure."); 11626 11627 static PyObject * 11628 unicode_find(PyObject *self, PyObject *args) 11629 { 11630 /* initialize variables to prevent gcc warning */ 11631 PyObject *substring = NULL; 11632 Py_ssize_t start = 0; 11633 Py_ssize_t end = 0; 11634 Py_ssize_t result; 11635 11636 if (!parse_args_finds_unicode("find", args, &substring, &start, &end)) 11637 return NULL; 11638 11639 if (PyUnicode_READY(self) == -1) 11640 return NULL; 11641 11642 result = any_find_slice(self, substring, start, end, 1); 11643 11644 if (result == -2) 11645 return NULL; 11646 11647 return PyLong_FromSsize_t(result); 11648 } 11649 11650 static PyObject * 11651 unicode_getitem(PyObject *self, Py_ssize_t index) 11652 { 11653 void *data; 11654 enum PyUnicode_Kind kind; 11655 Py_UCS4 ch; 11656 11657 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) { 11658 PyErr_BadArgument(); 11659 return NULL; 11660 } 11661 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { 11662 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11663 return NULL; 11664 } 11665 kind = PyUnicode_KIND(self); 11666 data = PyUnicode_DATA(self); 11667 ch = PyUnicode_READ(kind, data, index); 11668 return unicode_char(ch); 11669 } 11670 11671 /* Believe it or not, this produces the same value for ASCII strings 11672 as bytes_hash(). */ 11673 static Py_hash_t 11674 unicode_hash(PyObject *self) 11675 { 11676 Py_ssize_t len; 11677 Py_uhash_t x; /* Unsigned for defined overflow behavior. 
*/ 11678 11679 #ifdef Py_DEBUG 11680 assert(_Py_HashSecret_Initialized); 11681 #endif 11682 if (_PyUnicode_HASH(self) != -1) 11683 return _PyUnicode_HASH(self); 11684 if (PyUnicode_READY(self) == -1) 11685 return -1; 11686 len = PyUnicode_GET_LENGTH(self); 11687 /* 11688 We make the hash of the empty string be 0, rather than using 11689 (prefix ^ suffix), since this slightly obfuscates the hash secret 11690 */ 11691 if (len == 0) { 11692 _PyUnicode_HASH(self) = 0; 11693 return 0; 11694 } 11695 x = _Py_HashBytes(PyUnicode_DATA(self), 11696 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self)); 11697 _PyUnicode_HASH(self) = x; 11698 return x; 11699 } 11700 11701 PyDoc_STRVAR(index__doc__, 11702 "S.index(sub[, start[, end]]) -> int\n\ 11703 \n\ 11704 Like S.find() but raise ValueError when the substring is not found."); 11705 11706 static PyObject * 11707 unicode_index(PyObject *self, PyObject *args) 11708 { 11709 /* initialize variables to prevent gcc warning */ 11710 Py_ssize_t result; 11711 PyObject *substring = NULL; 11712 Py_ssize_t start = 0; 11713 Py_ssize_t end = 0; 11714 11715 if (!parse_args_finds_unicode("index", args, &substring, &start, &end)) 11716 return NULL; 11717 11718 if (PyUnicode_READY(self) == -1) 11719 return NULL; 11720 11721 result = any_find_slice(self, substring, start, end, 1); 11722 11723 if (result == -2) 11724 return NULL; 11725 11726 if (result < 0) { 11727 PyErr_SetString(PyExc_ValueError, "substring not found"); 11728 return NULL; 11729 } 11730 11731 return PyLong_FromSsize_t(result); 11732 } 11733 11734 PyDoc_STRVAR(islower__doc__, 11735 "S.islower() -> bool\n\ 11736 \n\ 11737 Return True if all cased characters in S are lowercase and there is\n\ 11738 at least one cased character in S, False otherwise."); 11739 11740 static PyObject* 11741 unicode_islower(PyObject *self) 11742 { 11743 Py_ssize_t i, length; 11744 int kind; 11745 void *data; 11746 int cased; 11747 11748 if (PyUnicode_READY(self) == -1) 11749 return NULL; 11750 length = 
PyUnicode_GET_LENGTH(self); 11751 kind = PyUnicode_KIND(self); 11752 data = PyUnicode_DATA(self); 11753 11754 /* Shortcut for single character strings */ 11755 if (length == 1) 11756 return PyBool_FromLong( 11757 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11758 11759 /* Special case for empty strings */ 11760 if (length == 0) 11761 return PyBool_FromLong(0); 11762 11763 cased = 0; 11764 for (i = 0; i < length; i++) { 11765 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11766 11767 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11768 return PyBool_FromLong(0); 11769 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11770 cased = 1; 11771 } 11772 return PyBool_FromLong(cased); 11773 } 11774 11775 PyDoc_STRVAR(isupper__doc__, 11776 "S.isupper() -> bool\n\ 11777 \n\ 11778 Return True if all cased characters in S are uppercase and there is\n\ 11779 at least one cased character in S, False otherwise."); 11780 11781 static PyObject* 11782 unicode_isupper(PyObject *self) 11783 { 11784 Py_ssize_t i, length; 11785 int kind; 11786 void *data; 11787 int cased; 11788 11789 if (PyUnicode_READY(self) == -1) 11790 return NULL; 11791 length = PyUnicode_GET_LENGTH(self); 11792 kind = PyUnicode_KIND(self); 11793 data = PyUnicode_DATA(self); 11794 11795 /* Shortcut for single character strings */ 11796 if (length == 1) 11797 return PyBool_FromLong( 11798 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11799 11800 /* Special case for empty strings */ 11801 if (length == 0) 11802 return PyBool_FromLong(0); 11803 11804 cased = 0; 11805 for (i = 0; i < length; i++) { 11806 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11807 11808 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11809 return PyBool_FromLong(0); 11810 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11811 cased = 1; 11812 } 11813 return PyBool_FromLong(cased); 11814 } 11815 11816 PyDoc_STRVAR(istitle__doc__, 11817 "S.istitle() -> bool\n\ 11818 \n\ 11819 Return True if S is a titlecased string and 
there is at least one\n\ 11820 character in S, i.e. upper- and titlecase characters may only\n\ 11821 follow uncased characters and lowercase characters only cased ones.\n\ 11822 Return False otherwise."); 11823 11824 static PyObject* 11825 unicode_istitle(PyObject *self) 11826 { 11827 Py_ssize_t i, length; 11828 int kind; 11829 void *data; 11830 int cased, previous_is_cased; 11831 11832 if (PyUnicode_READY(self) == -1) 11833 return NULL; 11834 length = PyUnicode_GET_LENGTH(self); 11835 kind = PyUnicode_KIND(self); 11836 data = PyUnicode_DATA(self); 11837 11838 /* Shortcut for single character strings */ 11839 if (length == 1) { 11840 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11841 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11842 (Py_UNICODE_ISUPPER(ch) != 0)); 11843 } 11844 11845 /* Special case for empty strings */ 11846 if (length == 0) 11847 return PyBool_FromLong(0); 11848 11849 cased = 0; 11850 previous_is_cased = 0; 11851 for (i = 0; i < length; i++) { 11852 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11853 11854 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11855 if (previous_is_cased) 11856 return PyBool_FromLong(0); 11857 previous_is_cased = 1; 11858 cased = 1; 11859 } 11860 else if (Py_UNICODE_ISLOWER(ch)) { 11861 if (!previous_is_cased) 11862 return PyBool_FromLong(0); 11863 previous_is_cased = 1; 11864 cased = 1; 11865 } 11866 else 11867 previous_is_cased = 0; 11868 } 11869 return PyBool_FromLong(cased); 11870 } 11871 11872 PyDoc_STRVAR(isspace__doc__, 11873 "S.isspace() -> bool\n\ 11874 \n\ 11875 Return True if all characters in S are whitespace\n\ 11876 and there is at least one character in S, False otherwise."); 11877 11878 static PyObject* 11879 unicode_isspace(PyObject *self) 11880 { 11881 Py_ssize_t i, length; 11882 int kind; 11883 void *data; 11884 11885 if (PyUnicode_READY(self) == -1) 11886 return NULL; 11887 length = PyUnicode_GET_LENGTH(self); 11888 kind = PyUnicode_KIND(self); 11889 data = 
PyUnicode_DATA(self); 11890 11891 /* Shortcut for single character strings */ 11892 if (length == 1) 11893 return PyBool_FromLong( 11894 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11895 11896 /* Special case for empty strings */ 11897 if (length == 0) 11898 return PyBool_FromLong(0); 11899 11900 for (i = 0; i < length; i++) { 11901 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11902 if (!Py_UNICODE_ISSPACE(ch)) 11903 return PyBool_FromLong(0); 11904 } 11905 return PyBool_FromLong(1); 11906 } 11907 11908 PyDoc_STRVAR(isalpha__doc__, 11909 "S.isalpha() -> bool\n\ 11910 \n\ 11911 Return True if all characters in S are alphabetic\n\ 11912 and there is at least one character in S, False otherwise."); 11913 11914 static PyObject* 11915 unicode_isalpha(PyObject *self) 11916 { 11917 Py_ssize_t i, length; 11918 int kind; 11919 void *data; 11920 11921 if (PyUnicode_READY(self) == -1) 11922 return NULL; 11923 length = PyUnicode_GET_LENGTH(self); 11924 kind = PyUnicode_KIND(self); 11925 data = PyUnicode_DATA(self); 11926 11927 /* Shortcut for single character strings */ 11928 if (length == 1) 11929 return PyBool_FromLong( 11930 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11931 11932 /* Special case for empty strings */ 11933 if (length == 0) 11934 return PyBool_FromLong(0); 11935 11936 for (i = 0; i < length; i++) { 11937 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11938 return PyBool_FromLong(0); 11939 } 11940 return PyBool_FromLong(1); 11941 } 11942 11943 PyDoc_STRVAR(isalnum__doc__, 11944 "S.isalnum() -> bool\n\ 11945 \n\ 11946 Return True if all characters in S are alphanumeric\n\ 11947 and there is at least one character in S, False otherwise."); 11948 11949 static PyObject* 11950 unicode_isalnum(PyObject *self) 11951 { 11952 int kind; 11953 void *data; 11954 Py_ssize_t len, i; 11955 11956 if (PyUnicode_READY(self) == -1) 11957 return NULL; 11958 11959 kind = PyUnicode_KIND(self); 11960 data = PyUnicode_DATA(self); 11961 len = 
PyUnicode_GET_LENGTH(self); 11962 11963 /* Shortcut for single character strings */ 11964 if (len == 1) { 11965 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11966 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11967 } 11968 11969 /* Special case for empty strings */ 11970 if (len == 0) 11971 return PyBool_FromLong(0); 11972 11973 for (i = 0; i < len; i++) { 11974 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11975 if (!Py_UNICODE_ISALNUM(ch)) 11976 return PyBool_FromLong(0); 11977 } 11978 return PyBool_FromLong(1); 11979 } 11980 11981 PyDoc_STRVAR(isdecimal__doc__, 11982 "S.isdecimal() -> bool\n\ 11983 \n\ 11984 Return True if there are only decimal characters in S,\n\ 11985 False otherwise."); 11986 11987 static PyObject* 11988 unicode_isdecimal(PyObject *self) 11989 { 11990 Py_ssize_t i, length; 11991 int kind; 11992 void *data; 11993 11994 if (PyUnicode_READY(self) == -1) 11995 return NULL; 11996 length = PyUnicode_GET_LENGTH(self); 11997 kind = PyUnicode_KIND(self); 11998 data = PyUnicode_DATA(self); 11999 12000 /* Shortcut for single character strings */ 12001 if (length == 1) 12002 return PyBool_FromLong( 12003 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 12004 12005 /* Special case for empty strings */ 12006 if (length == 0) 12007 return PyBool_FromLong(0); 12008 12009 for (i = 0; i < length; i++) { 12010 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 12011 return PyBool_FromLong(0); 12012 } 12013 return PyBool_FromLong(1); 12014 } 12015 12016 PyDoc_STRVAR(isdigit__doc__, 12017 "S.isdigit() -> bool\n\ 12018 \n\ 12019 Return True if all characters in S are digits\n\ 12020 and there is at least one character in S, False otherwise."); 12021 12022 static PyObject* 12023 unicode_isdigit(PyObject *self) 12024 { 12025 Py_ssize_t i, length; 12026 int kind; 12027 void *data; 12028 12029 if (PyUnicode_READY(self) == -1) 12030 return NULL; 12031 length = PyUnicode_GET_LENGTH(self); 12032 kind = PyUnicode_KIND(self); 12033 data = 
PyUnicode_DATA(self); 12034 12035 /* Shortcut for single character strings */ 12036 if (length == 1) { 12037 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 12038 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 12039 } 12040 12041 /* Special case for empty strings */ 12042 if (length == 0) 12043 return PyBool_FromLong(0); 12044 12045 for (i = 0; i < length; i++) { 12046 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 12047 return PyBool_FromLong(0); 12048 } 12049 return PyBool_FromLong(1); 12050 } 12051 12052 PyDoc_STRVAR(isnumeric__doc__, 12053 "S.isnumeric() -> bool\n\ 12054 \n\ 12055 Return True if there are only numeric characters in S,\n\ 12056 False otherwise."); 12057 12058 static PyObject* 12059 unicode_isnumeric(PyObject *self) 12060 { 12061 Py_ssize_t i, length; 12062 int kind; 12063 void *data; 12064 12065 if (PyUnicode_READY(self) == -1) 12066 return NULL; 12067 length = PyUnicode_GET_LENGTH(self); 12068 kind = PyUnicode_KIND(self); 12069 data = PyUnicode_DATA(self); 12070 12071 /* Shortcut for single character strings */ 12072 if (length == 1) 12073 return PyBool_FromLong( 12074 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 12075 12076 /* Special case for empty strings */ 12077 if (length == 0) 12078 return PyBool_FromLong(0); 12079 12080 for (i = 0; i < length; i++) { 12081 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 12082 return PyBool_FromLong(0); 12083 } 12084 return PyBool_FromLong(1); 12085 } 12086 12087 int 12088 PyUnicode_IsIdentifier(PyObject *self) 12089 { 12090 int kind; 12091 void *data; 12092 Py_ssize_t i; 12093 Py_UCS4 first; 12094 12095 if (PyUnicode_READY(self) == -1) { 12096 Py_FatalError("identifier not ready"); 12097 return 0; 12098 } 12099 12100 /* Special case for empty strings */ 12101 if (PyUnicode_GET_LENGTH(self) == 0) 12102 return 0; 12103 kind = PyUnicode_KIND(self); 12104 data = PyUnicode_DATA(self); 12105 12106 /* PEP 3131 says that the first character must be in 12107 XID_Start and 
subsequent characters in XID_Continue, 12108 and for the ASCII range, the 2.x rules apply (i.e 12109 start with letters and underscore, continue with 12110 letters, digits, underscore). However, given the current 12111 definition of XID_Start and XID_Continue, it is sufficient 12112 to check just for these, except that _ must be allowed 12113 as starting an identifier. */ 12114 first = PyUnicode_READ(kind, data, 0); 12115 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 12116 return 0; 12117 12118 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 12119 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 12120 return 0; 12121 return 1; 12122 } 12123 12124 PyDoc_STRVAR(isidentifier__doc__, 12125 "S.isidentifier() -> bool\n\ 12126 \n\ 12127 Return True if S is a valid identifier according\n\ 12128 to the language definition.\n\ 12129 \n\ 12130 Use keyword.iskeyword() to test for reserved identifiers\n\ 12131 such as \"def\" and \"class\".\n"); 12132 12133 static PyObject* 12134 unicode_isidentifier(PyObject *self) 12135 { 12136 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 12137 } 12138 12139 PyDoc_STRVAR(isprintable__doc__, 12140 "S.isprintable() -> bool\n\ 12141 \n\ 12142 Return True if all characters in S are considered\n\ 12143 printable in repr() or S is empty, False otherwise."); 12144 12145 static PyObject* 12146 unicode_isprintable(PyObject *self) 12147 { 12148 Py_ssize_t i, length; 12149 int kind; 12150 void *data; 12151 12152 if (PyUnicode_READY(self) == -1) 12153 return NULL; 12154 length = PyUnicode_GET_LENGTH(self); 12155 kind = PyUnicode_KIND(self); 12156 data = PyUnicode_DATA(self); 12157 12158 /* Shortcut for single character strings */ 12159 if (length == 1) 12160 return PyBool_FromLong( 12161 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 12162 12163 for (i = 0; i < length; i++) { 12164 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 12165 Py_RETURN_FALSE; 12166 } 12167 } 12168 Py_RETURN_TRUE; 
12169 } 12170 12171 PyDoc_STRVAR(join__doc__, 12172 "S.join(iterable) -> str\n\ 12173 \n\ 12174 Return a string which is the concatenation of the strings in the\n\ 12175 iterable. The separator between elements is S."); 12176 12177 static PyObject* 12178 unicode_join(PyObject *self, PyObject *data) 12179 { 12180 return PyUnicode_Join(self, data); 12181 } 12182 12183 static Py_ssize_t 12184 unicode_length(PyObject *self) 12185 { 12186 if (PyUnicode_READY(self) == -1) 12187 return -1; 12188 return PyUnicode_GET_LENGTH(self); 12189 } 12190 12191 PyDoc_STRVAR(ljust__doc__, 12192 "S.ljust(width[, fillchar]) -> str\n\ 12193 \n\ 12194 Return S left-justified in a Unicode string of length width. Padding is\n\ 12195 done using the specified fill character (default is a space)."); 12196 12197 static PyObject * 12198 unicode_ljust(PyObject *self, PyObject *args) 12199 { 12200 Py_ssize_t width; 12201 Py_UCS4 fillchar = ' '; 12202 12203 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 12204 return NULL; 12205 12206 if (PyUnicode_READY(self) == -1) 12207 return NULL; 12208 12209 if (PyUnicode_GET_LENGTH(self) >= width) 12210 return unicode_result_unchanged(self); 12211 12212 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar); 12213 } 12214 12215 PyDoc_STRVAR(lower__doc__, 12216 "S.lower() -> str\n\ 12217 \n\ 12218 Return a copy of the string S converted to lowercase."); 12219 12220 static PyObject* 12221 unicode_lower(PyObject *self) 12222 { 12223 if (PyUnicode_READY(self) == -1) 12224 return NULL; 12225 if (PyUnicode_IS_ASCII(self)) 12226 return ascii_upper_or_lower(self, 1); 12227 return case_operation(self, do_lower); 12228 } 12229 12230 #define LEFTSTRIP 0 12231 #define RIGHTSTRIP 1 12232 #define BOTHSTRIP 2 12233 12234 /* Arrays indexed by above */ 12235 static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 12236 12237 #define STRIPNAME(i) (stripformat[i]+3) 12238 12239 /* externally visible for 
str.strip(unicode) */ 12240 PyObject * 12241 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 12242 { 12243 void *data; 12244 int kind; 12245 Py_ssize_t i, j, len; 12246 BLOOM_MASK sepmask; 12247 Py_ssize_t seplen; 12248 12249 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 12250 return NULL; 12251 12252 kind = PyUnicode_KIND(self); 12253 data = PyUnicode_DATA(self); 12254 len = PyUnicode_GET_LENGTH(self); 12255 seplen = PyUnicode_GET_LENGTH(sepobj); 12256 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 12257 PyUnicode_DATA(sepobj), 12258 seplen); 12259 12260 i = 0; 12261 if (striptype != RIGHTSTRIP) { 12262 while (i < len) { 12263 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12264 if (!BLOOM(sepmask, ch)) 12265 break; 12266 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 12267 break; 12268 i++; 12269 } 12270 } 12271 12272 j = len; 12273 if (striptype != LEFTSTRIP) { 12274 j--; 12275 while (j >= i) { 12276 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12277 if (!BLOOM(sepmask, ch)) 12278 break; 12279 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 12280 break; 12281 j--; 12282 } 12283 12284 j++; 12285 } 12286 12287 return PyUnicode_Substring(self, i, j); 12288 } 12289 12290 PyObject* 12291 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 12292 { 12293 unsigned char *data; 12294 int kind; 12295 Py_ssize_t length; 12296 12297 if (PyUnicode_READY(self) == -1) 12298 return NULL; 12299 12300 length = PyUnicode_GET_LENGTH(self); 12301 end = Py_MIN(end, length); 12302 12303 if (start == 0 && end == length) 12304 return unicode_result_unchanged(self); 12305 12306 if (start < 0 || end < 0) { 12307 PyErr_SetString(PyExc_IndexError, "string index out of range"); 12308 return NULL; 12309 } 12310 if (start >= length || end < start) 12311 _Py_RETURN_UNICODE_EMPTY(); 12312 12313 length = end - start; 12314 if (PyUnicode_IS_ASCII(self)) { 12315 data = PyUnicode_1BYTE_DATA(self); 12316 return 
_PyUnicode_FromASCII((char*)(data + start), length); 12317 } 12318 else { 12319 kind = PyUnicode_KIND(self); 12320 data = PyUnicode_1BYTE_DATA(self); 12321 return PyUnicode_FromKindAndData(kind, 12322 data + kind * start, 12323 length); 12324 } 12325 } 12326 12327 static PyObject * 12328 do_strip(PyObject *self, int striptype) 12329 { 12330 Py_ssize_t len, i, j; 12331 12332 if (PyUnicode_READY(self) == -1) 12333 return NULL; 12334 12335 len = PyUnicode_GET_LENGTH(self); 12336 12337 if (PyUnicode_IS_ASCII(self)) { 12338 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self); 12339 12340 i = 0; 12341 if (striptype != RIGHTSTRIP) { 12342 while (i < len) { 12343 Py_UCS1 ch = data[i]; 12344 if (!_Py_ascii_whitespace[ch]) 12345 break; 12346 i++; 12347 } 12348 } 12349 12350 j = len; 12351 if (striptype != LEFTSTRIP) { 12352 j--; 12353 while (j >= i) { 12354 Py_UCS1 ch = data[j]; 12355 if (!_Py_ascii_whitespace[ch]) 12356 break; 12357 j--; 12358 } 12359 j++; 12360 } 12361 } 12362 else { 12363 int kind = PyUnicode_KIND(self); 12364 void *data = PyUnicode_DATA(self); 12365 12366 i = 0; 12367 if (striptype != RIGHTSTRIP) { 12368 while (i < len) { 12369 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12370 if (!Py_UNICODE_ISSPACE(ch)) 12371 break; 12372 i++; 12373 } 12374 } 12375 12376 j = len; 12377 if (striptype != LEFTSTRIP) { 12378 j--; 12379 while (j >= i) { 12380 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12381 if (!Py_UNICODE_ISSPACE(ch)) 12382 break; 12383 j--; 12384 } 12385 j++; 12386 } 12387 } 12388 12389 return PyUnicode_Substring(self, i, j); 12390 } 12391 12392 12393 static PyObject * 12394 do_argstrip(PyObject *self, int striptype, PyObject *args) 12395 { 12396 PyObject *sep = NULL; 12397 12398 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep)) 12399 return NULL; 12400 12401 if (sep != NULL && sep != Py_None) { 12402 if (PyUnicode_Check(sep)) 12403 return _PyUnicode_XStrip(self, striptype, sep); 12404 else { 12405 PyErr_Format(PyExc_TypeError, 12406 "%s arg must be None 
or str", 12407 STRIPNAME(striptype)); 12408 return NULL; 12409 } 12410 } 12411 12412 return do_strip(self, striptype); 12413 } 12414 12415 12416 PyDoc_STRVAR(strip__doc__, 12417 "S.strip([chars]) -> str\n\ 12418 \n\ 12419 Return a copy of the string S with leading and trailing\n\ 12420 whitespace removed.\n\ 12421 If chars is given and not None, remove characters in chars instead."); 12422 12423 static PyObject * 12424 unicode_strip(PyObject *self, PyObject *args) 12425 { 12426 if (PyTuple_GET_SIZE(args) == 0) 12427 return do_strip(self, BOTHSTRIP); /* Common case */ 12428 else 12429 return do_argstrip(self, BOTHSTRIP, args); 12430 } 12431 12432 12433 PyDoc_STRVAR(lstrip__doc__, 12434 "S.lstrip([chars]) -> str\n\ 12435 \n\ 12436 Return a copy of the string S with leading whitespace removed.\n\ 12437 If chars is given and not None, remove characters in chars instead."); 12438 12439 static PyObject * 12440 unicode_lstrip(PyObject *self, PyObject *args) 12441 { 12442 if (PyTuple_GET_SIZE(args) == 0) 12443 return do_strip(self, LEFTSTRIP); /* Common case */ 12444 else 12445 return do_argstrip(self, LEFTSTRIP, args); 12446 } 12447 12448 12449 PyDoc_STRVAR(rstrip__doc__, 12450 "S.rstrip([chars]) -> str\n\ 12451 \n\ 12452 Return a copy of the string S with trailing whitespace removed.\n\ 12453 If chars is given and not None, remove characters in chars instead."); 12454 12455 static PyObject * 12456 unicode_rstrip(PyObject *self, PyObject *args) 12457 { 12458 if (PyTuple_GET_SIZE(args) == 0) 12459 return do_strip(self, RIGHTSTRIP); /* Common case */ 12460 else 12461 return do_argstrip(self, RIGHTSTRIP, args); 12462 } 12463 12464 12465 static PyObject* 12466 unicode_repeat(PyObject *str, Py_ssize_t len) 12467 { 12468 PyObject *u; 12469 Py_ssize_t nchars, n; 12470 12471 if (len < 1) 12472 _Py_RETURN_UNICODE_EMPTY(); 12473 12474 /* no repeat, return original string */ 12475 if (len == 1) 12476 return unicode_result_unchanged(str); 12477 12478 if (PyUnicode_READY(str) == -1) 
12479 return NULL; 12480 12481 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 12482 PyErr_SetString(PyExc_OverflowError, 12483 "repeated string is too long"); 12484 return NULL; 12485 } 12486 nchars = len * PyUnicode_GET_LENGTH(str); 12487 12488 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 12489 if (!u) 12490 return NULL; 12491 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 12492 12493 if (PyUnicode_GET_LENGTH(str) == 1) { 12494 const int kind = PyUnicode_KIND(str); 12495 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 12496 if (kind == PyUnicode_1BYTE_KIND) { 12497 void *to = PyUnicode_DATA(u); 12498 memset(to, (unsigned char)fill_char, len); 12499 } 12500 else if (kind == PyUnicode_2BYTE_KIND) { 12501 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u); 12502 for (n = 0; n < len; ++n) 12503 ucs2[n] = fill_char; 12504 } else { 12505 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u); 12506 assert(kind == PyUnicode_4BYTE_KIND); 12507 for (n = 0; n < len; ++n) 12508 ucs4[n] = fill_char; 12509 } 12510 } 12511 else { 12512 /* number of characters copied this far */ 12513 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 12514 const Py_ssize_t char_size = PyUnicode_KIND(str); 12515 char *to = (char *) PyUnicode_DATA(u); 12516 memcpy(to, PyUnicode_DATA(str), 12517 PyUnicode_GET_LENGTH(str) * char_size); 12518 while (done < nchars) { 12519 n = (done <= nchars-done) ? 
done : nchars-done; 12520 memcpy(to + (done * char_size), to, n * char_size); 12521 done += n; 12522 } 12523 } 12524 12525 assert(_PyUnicode_CheckConsistency(u, 1)); 12526 return u; 12527 } 12528 12529 PyObject * 12530 PyUnicode_Replace(PyObject *str, 12531 PyObject *substr, 12532 PyObject *replstr, 12533 Py_ssize_t maxcount) 12534 { 12535 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 || 12536 ensure_unicode(replstr) < 0) 12537 return NULL; 12538 return replace(str, substr, replstr, maxcount); 12539 } 12540 12541 PyDoc_STRVAR(replace__doc__, 12542 "S.replace(old, new[, count]) -> str\n\ 12543 \n\ 12544 Return a copy of S with all occurrences of substring\n\ 12545 old replaced by new. If the optional argument count is\n\ 12546 given, only the first count occurrences are replaced."); 12547 12548 static PyObject* 12549 unicode_replace(PyObject *self, PyObject *args) 12550 { 12551 PyObject *str1; 12552 PyObject *str2; 12553 Py_ssize_t maxcount = -1; 12554 12555 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount)) 12556 return NULL; 12557 if (PyUnicode_READY(self) == -1) 12558 return NULL; 12559 return replace(self, str1, str2, maxcount); 12560 } 12561 12562 static PyObject * 12563 unicode_repr(PyObject *unicode) 12564 { 12565 PyObject *repr; 12566 Py_ssize_t isize; 12567 Py_ssize_t osize, squote, dquote, i, o; 12568 Py_UCS4 max, quote; 12569 int ikind, okind, unchanged; 12570 void *idata, *odata; 12571 12572 if (PyUnicode_READY(unicode) == -1) 12573 return NULL; 12574 12575 isize = PyUnicode_GET_LENGTH(unicode); 12576 idata = PyUnicode_DATA(unicode); 12577 12578 /* Compute length of output, quote characters, and 12579 maximum character */ 12580 osize = 0; 12581 max = 127; 12582 squote = dquote = 0; 12583 ikind = PyUnicode_KIND(unicode); 12584 for (i = 0; i < isize; i++) { 12585 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12586 Py_ssize_t incr = 1; 12587 switch (ch) { 12588 case '\'': squote++; break; 12589 case '"': dquote++; break; 12590 
case '\\': case '\t': case '\r': case '\n': 12591 incr = 2; 12592 break; 12593 default: 12594 /* Fast-path ASCII */ 12595 if (ch < ' ' || ch == 0x7f) 12596 incr = 4; /* \xHH */ 12597 else if (ch < 0x7f) 12598 ; 12599 else if (Py_UNICODE_ISPRINTABLE(ch)) 12600 max = ch > max ? ch : max; 12601 else if (ch < 0x100) 12602 incr = 4; /* \xHH */ 12603 else if (ch < 0x10000) 12604 incr = 6; /* \uHHHH */ 12605 else 12606 incr = 10; /* \uHHHHHHHH */ 12607 } 12608 if (osize > PY_SSIZE_T_MAX - incr) { 12609 PyErr_SetString(PyExc_OverflowError, 12610 "string is too long to generate repr"); 12611 return NULL; 12612 } 12613 osize += incr; 12614 } 12615 12616 quote = '\''; 12617 unchanged = (osize == isize); 12618 if (squote) { 12619 unchanged = 0; 12620 if (dquote) 12621 /* Both squote and dquote present. Use squote, 12622 and escape them */ 12623 osize += squote; 12624 else 12625 quote = '"'; 12626 } 12627 osize += 2; /* quotes */ 12628 12629 repr = PyUnicode_New(osize, max); 12630 if (repr == NULL) 12631 return NULL; 12632 okind = PyUnicode_KIND(repr); 12633 odata = PyUnicode_DATA(repr); 12634 12635 PyUnicode_WRITE(okind, odata, 0, quote); 12636 PyUnicode_WRITE(okind, odata, osize-1, quote); 12637 if (unchanged) { 12638 _PyUnicode_FastCopyCharacters(repr, 1, 12639 unicode, 0, 12640 isize); 12641 } 12642 else { 12643 for (i = 0, o = 1; i < isize; i++) { 12644 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12645 12646 /* Escape quotes and backslashes */ 12647 if ((ch == quote) || (ch == '\\')) { 12648 PyUnicode_WRITE(okind, odata, o++, '\\'); 12649 PyUnicode_WRITE(okind, odata, o++, ch); 12650 continue; 12651 } 12652 12653 /* Map special whitespace to '\t', \n', '\r' */ 12654 if (ch == '\t') { 12655 PyUnicode_WRITE(okind, odata, o++, '\\'); 12656 PyUnicode_WRITE(okind, odata, o++, 't'); 12657 } 12658 else if (ch == '\n') { 12659 PyUnicode_WRITE(okind, odata, o++, '\\'); 12660 PyUnicode_WRITE(okind, odata, o++, 'n'); 12661 } 12662 else if (ch == '\r') { 12663 PyUnicode_WRITE(okind, 
odata, o++, '\\'); 12664 PyUnicode_WRITE(okind, odata, o++, 'r'); 12665 } 12666 12667 /* Map non-printable US ASCII to '\xhh' */ 12668 else if (ch < ' ' || ch == 0x7F) { 12669 PyUnicode_WRITE(okind, odata, o++, '\\'); 12670 PyUnicode_WRITE(okind, odata, o++, 'x'); 12671 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12672 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12673 } 12674 12675 /* Copy ASCII characters as-is */ 12676 else if (ch < 0x7F) { 12677 PyUnicode_WRITE(okind, odata, o++, ch); 12678 } 12679 12680 /* Non-ASCII characters */ 12681 else { 12682 /* Map Unicode whitespace and control characters 12683 (categories Z* and C* except ASCII space) 12684 */ 12685 if (!Py_UNICODE_ISPRINTABLE(ch)) { 12686 PyUnicode_WRITE(okind, odata, o++, '\\'); 12687 /* Map 8-bit characters to '\xhh' */ 12688 if (ch <= 0xff) { 12689 PyUnicode_WRITE(okind, odata, o++, 'x'); 12690 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12691 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12692 } 12693 /* Map 16-bit characters to '\uxxxx' */ 12694 else if (ch <= 0xffff) { 12695 PyUnicode_WRITE(okind, odata, o++, 'u'); 12696 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12697 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12698 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12699 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12700 } 12701 /* Map 21-bit characters to '\U00xxxxxx' */ 12702 else { 12703 PyUnicode_WRITE(okind, odata, o++, 'U'); 12704 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); 12705 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); 12706 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); 12707 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); 12708 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12709 PyUnicode_WRITE(okind, odata, 
o++, Py_hexdigits[(ch >> 8) & 0xF]); 12710 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12711 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12712 } 12713 } 12714 /* Copy characters as-is */ 12715 else { 12716 PyUnicode_WRITE(okind, odata, o++, ch); 12717 } 12718 } 12719 } 12720 } 12721 /* Closing quote already added at the beginning */ 12722 assert(_PyUnicode_CheckConsistency(repr, 1)); 12723 return repr; 12724 } 12725 12726 PyDoc_STRVAR(rfind__doc__, 12727 "S.rfind(sub[, start[, end]]) -> int\n\ 12728 \n\ 12729 Return the highest index in S where substring sub is found,\n\ 12730 such that sub is contained within S[start:end]. Optional\n\ 12731 arguments start and end are interpreted as in slice notation.\n\ 12732 \n\ 12733 Return -1 on failure."); 12734 12735 static PyObject * 12736 unicode_rfind(PyObject *self, PyObject *args) 12737 { 12738 /* initialize variables to prevent gcc warning */ 12739 PyObject *substring = NULL; 12740 Py_ssize_t start = 0; 12741 Py_ssize_t end = 0; 12742 Py_ssize_t result; 12743 12744 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end)) 12745 return NULL; 12746 12747 if (PyUnicode_READY(self) == -1) 12748 return NULL; 12749 12750 result = any_find_slice(self, substring, start, end, -1); 12751 12752 if (result == -2) 12753 return NULL; 12754 12755 return PyLong_FromSsize_t(result); 12756 } 12757 12758 PyDoc_STRVAR(rindex__doc__, 12759 "S.rindex(sub[, start[, end]]) -> int\n\ 12760 \n\ 12761 Like S.rfind() but raise ValueError when the substring is not found."); 12762 12763 static PyObject * 12764 unicode_rindex(PyObject *self, PyObject *args) 12765 { 12766 /* initialize variables to prevent gcc warning */ 12767 PyObject *substring = NULL; 12768 Py_ssize_t start = 0; 12769 Py_ssize_t end = 0; 12770 Py_ssize_t result; 12771 12772 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end)) 12773 return NULL; 12774 12775 if (PyUnicode_READY(self) == -1) 12776 return NULL; 
12777 12778 result = any_find_slice(self, substring, start, end, -1); 12779 12780 if (result == -2) 12781 return NULL; 12782 12783 if (result < 0) { 12784 PyErr_SetString(PyExc_ValueError, "substring not found"); 12785 return NULL; 12786 } 12787 12788 return PyLong_FromSsize_t(result); 12789 } 12790 12791 PyDoc_STRVAR(rjust__doc__, 12792 "S.rjust(width[, fillchar]) -> str\n\ 12793 \n\ 12794 Return S right-justified in a string of length width. Padding is\n\ 12795 done using the specified fill character (default is a space)."); 12796 12797 static PyObject * 12798 unicode_rjust(PyObject *self, PyObject *args) 12799 { 12800 Py_ssize_t width; 12801 Py_UCS4 fillchar = ' '; 12802 12803 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 12804 return NULL; 12805 12806 if (PyUnicode_READY(self) == -1) 12807 return NULL; 12808 12809 if (PyUnicode_GET_LENGTH(self) >= width) 12810 return unicode_result_unchanged(self); 12811 12812 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar); 12813 } 12814 12815 PyObject * 12816 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12817 { 12818 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) 12819 return NULL; 12820 12821 return split(s, sep, maxsplit); 12822 } 12823 12824 PyDoc_STRVAR(split__doc__, 12825 "S.split(sep=None, maxsplit=-1) -> list of strings\n\ 12826 \n\ 12827 Return a list of the words in S, using sep as the\n\ 12828 delimiter string. If maxsplit is given, at most maxsplit\n\ 12829 splits are done. 
If sep is not specified or is None, any\n\ 12830 whitespace string is a separator and empty strings are\n\ 12831 removed from the result."); 12832 12833 static PyObject* 12834 unicode_split(PyObject *self, PyObject *args, PyObject *kwds) 12835 { 12836 static char *kwlist[] = {"sep", "maxsplit", 0}; 12837 PyObject *substring = Py_None; 12838 Py_ssize_t maxcount = -1; 12839 12840 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split", 12841 kwlist, &substring, &maxcount)) 12842 return NULL; 12843 12844 if (substring == Py_None) 12845 return split(self, NULL, maxcount); 12846 12847 if (PyUnicode_Check(substring)) 12848 return split(self, substring, maxcount); 12849 12850 PyErr_Format(PyExc_TypeError, 12851 "must be str or None, not %.100s", 12852 Py_TYPE(substring)->tp_name); 12853 return NULL; 12854 } 12855 12856 PyObject * 12857 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj) 12858 { 12859 PyObject* out; 12860 int kind1, kind2; 12861 void *buf1, *buf2; 12862 Py_ssize_t len1, len2; 12863 12864 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) 12865 return NULL; 12866 12867 kind1 = PyUnicode_KIND(str_obj); 12868 kind2 = PyUnicode_KIND(sep_obj); 12869 len1 = PyUnicode_GET_LENGTH(str_obj); 12870 len2 = PyUnicode_GET_LENGTH(sep_obj); 12871 if (kind1 < kind2 || len1 < len2) { 12872 _Py_INCREF_UNICODE_EMPTY(); 12873 if (!unicode_empty) 12874 out = NULL; 12875 else { 12876 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty); 12877 Py_DECREF(unicode_empty); 12878 } 12879 return out; 12880 } 12881 buf1 = PyUnicode_DATA(str_obj); 12882 buf2 = PyUnicode_DATA(sep_obj); 12883 if (kind2 != kind1) { 12884 buf2 = _PyUnicode_AsKind(sep_obj, kind1); 12885 if (!buf2) 12886 return NULL; 12887 } 12888 12889 switch (kind1) { 12890 case PyUnicode_1BYTE_KIND: 12891 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12892 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12893 else 12894 out = ucs1lib_partition(str_obj, buf1, 
len1, sep_obj, buf2, len2); 12895 break; 12896 case PyUnicode_2BYTE_KIND: 12897 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12898 break; 12899 case PyUnicode_4BYTE_KIND: 12900 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12901 break; 12902 default: 12903 assert(0); 12904 out = 0; 12905 } 12906 12907 if (kind2 != kind1) 12908 PyMem_Free(buf2); 12909 12910 return out; 12911 } 12912 12913 12914 PyObject * 12915 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj) 12916 { 12917 PyObject* out; 12918 int kind1, kind2; 12919 void *buf1, *buf2; 12920 Py_ssize_t len1, len2; 12921 12922 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) 12923 return NULL; 12924 12925 kind1 = PyUnicode_KIND(str_obj); 12926 kind2 = PyUnicode_KIND(sep_obj); 12927 len1 = PyUnicode_GET_LENGTH(str_obj); 12928 len2 = PyUnicode_GET_LENGTH(sep_obj); 12929 if (kind1 < kind2 || len1 < len2) { 12930 _Py_INCREF_UNICODE_EMPTY(); 12931 if (!unicode_empty) 12932 out = NULL; 12933 else { 12934 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj); 12935 Py_DECREF(unicode_empty); 12936 } 12937 return out; 12938 } 12939 buf1 = PyUnicode_DATA(str_obj); 12940 buf2 = PyUnicode_DATA(sep_obj); 12941 if (kind2 != kind1) { 12942 buf2 = _PyUnicode_AsKind(sep_obj, kind1); 12943 if (!buf2) 12944 return NULL; 12945 } 12946 12947 switch (kind1) { 12948 case PyUnicode_1BYTE_KIND: 12949 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12950 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12951 else 12952 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12953 break; 12954 case PyUnicode_2BYTE_KIND: 12955 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12956 break; 12957 case PyUnicode_4BYTE_KIND: 12958 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12959 break; 12960 default: 12961 assert(0); 12962 out = 0; 12963 } 12964 12965 if (kind2 != kind1) 12966 
PyMem_Free(buf2); 12967 12968 return out; 12969 } 12970 12971 PyDoc_STRVAR(partition__doc__, 12972 "S.partition(sep) -> (head, sep, tail)\n\ 12973 \n\ 12974 Search for the separator sep in S, and return the part before it,\n\ 12975 the separator itself, and the part after it. If the separator is not\n\ 12976 found, return S and two empty strings."); 12977 12978 static PyObject* 12979 unicode_partition(PyObject *self, PyObject *separator) 12980 { 12981 return PyUnicode_Partition(self, separator); 12982 } 12983 12984 PyDoc_STRVAR(rpartition__doc__, 12985 "S.rpartition(sep) -> (head, sep, tail)\n\ 12986 \n\ 12987 Search for the separator sep in S, starting at the end of S, and return\n\ 12988 the part before it, the separator itself, and the part after it. If the\n\ 12989 separator is not found, return two empty strings and S."); 12990 12991 static PyObject* 12992 unicode_rpartition(PyObject *self, PyObject *separator) 12993 { 12994 return PyUnicode_RPartition(self, separator); 12995 } 12996 12997 PyObject * 12998 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12999 { 13000 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) 13001 return NULL; 13002 13003 return rsplit(s, sep, maxsplit); 13004 } 13005 13006 PyDoc_STRVAR(rsplit__doc__, 13007 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\ 13008 \n\ 13009 Return a list of the words in S, using sep as the\n\ 13010 delimiter string, starting at the end of the string and\n\ 13011 working to the front. If maxsplit is given, at most maxsplit\n\ 13012 splits are done. 
If sep is not specified, any whitespace string\n\ 13013 is a separator."); 13014 13015 static PyObject* 13016 unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds) 13017 { 13018 static char *kwlist[] = {"sep", "maxsplit", 0}; 13019 PyObject *substring = Py_None; 13020 Py_ssize_t maxcount = -1; 13021 13022 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit", 13023 kwlist, &substring, &maxcount)) 13024 return NULL; 13025 13026 if (substring == Py_None) 13027 return rsplit(self, NULL, maxcount); 13028 13029 if (PyUnicode_Check(substring)) 13030 return rsplit(self, substring, maxcount); 13031 13032 PyErr_Format(PyExc_TypeError, 13033 "must be str or None, not %.100s", 13034 Py_TYPE(substring)->tp_name); 13035 return NULL; 13036 } 13037 13038 PyDoc_STRVAR(splitlines__doc__, 13039 "S.splitlines([keepends]) -> list of strings\n\ 13040 \n\ 13041 Return a list of the lines in S, breaking at line boundaries.\n\ 13042 Line breaks are not included in the resulting list unless keepends\n\ 13043 is given and true."); 13044 13045 static PyObject* 13046 unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds) 13047 { 13048 static char *kwlist[] = {"keepends", 0}; 13049 int keepends = 0; 13050 13051 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 13052 kwlist, &keepends)) 13053 return NULL; 13054 13055 return PyUnicode_Splitlines(self, keepends); 13056 } 13057 13058 static 13059 PyObject *unicode_str(PyObject *self) 13060 { 13061 return unicode_result_unchanged(self); 13062 } 13063 13064 PyDoc_STRVAR(swapcase__doc__, 13065 "S.swapcase() -> str\n\ 13066 \n\ 13067 Return a copy of S with uppercase characters converted to lowercase\n\ 13068 and vice versa."); 13069 13070 static PyObject* 13071 unicode_swapcase(PyObject *self) 13072 { 13073 if (PyUnicode_READY(self) == -1) 13074 return NULL; 13075 return case_operation(self, do_swapcase); 13076 } 13077 13078 /*[clinic input] 13079 13080 @staticmethod 13081 str.maketrans as unicode_maketrans 13082 
13083 x: object 13084 13085 y: unicode=NULL 13086 13087 z: unicode=NULL 13088 13089 / 13090 13091 Return a translation table usable for str.translate(). 13092 13093 If there is only one argument, it must be a dictionary mapping Unicode 13094 ordinals (integers) or characters to Unicode ordinals, strings or None. 13095 Character keys will be then converted to ordinals. 13096 If there are two arguments, they must be strings of equal length, and 13097 in the resulting dictionary, each character in x will be mapped to the 13098 character at the same position in y. If there is a third argument, it 13099 must be a string, whose characters will be mapped to None in the result. 13100 [clinic start generated code]*/ 13101 13102 static PyObject * 13103 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z) 13104 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/ 13105 { 13106 PyObject *new = NULL, *key, *value; 13107 Py_ssize_t i = 0; 13108 int res; 13109 13110 new = PyDict_New(); 13111 if (!new) 13112 return NULL; 13113 if (y != NULL) { 13114 int x_kind, y_kind, z_kind; 13115 void *x_data, *y_data, *z_data; 13116 13117 /* x must be a string too, of equal length */ 13118 if (!PyUnicode_Check(x)) { 13119 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 13120 "be a string if there is a second argument"); 13121 goto err; 13122 } 13123 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 13124 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 13125 "arguments must have equal length"); 13126 goto err; 13127 } 13128 /* create entries for translating chars in x to those in y */ 13129 x_kind = PyUnicode_KIND(x); 13130 y_kind = PyUnicode_KIND(y); 13131 x_data = PyUnicode_DATA(x); 13132 y_data = PyUnicode_DATA(y); 13133 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 13134 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 13135 if (!key) 13136 goto err; 13137 value = PyLong_FromLong(PyUnicode_READ(y_kind, 
y_data, i)); 13138 if (!value) { 13139 Py_DECREF(key); 13140 goto err; 13141 } 13142 res = PyDict_SetItem(new, key, value); 13143 Py_DECREF(key); 13144 Py_DECREF(value); 13145 if (res < 0) 13146 goto err; 13147 } 13148 /* create entries for deleting chars in z */ 13149 if (z != NULL) { 13150 z_kind = PyUnicode_KIND(z); 13151 z_data = PyUnicode_DATA(z); 13152 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 13153 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 13154 if (!key) 13155 goto err; 13156 res = PyDict_SetItem(new, key, Py_None); 13157 Py_DECREF(key); 13158 if (res < 0) 13159 goto err; 13160 } 13161 } 13162 } else { 13163 int kind; 13164 void *data; 13165 13166 /* x must be a dict */ 13167 if (!PyDict_CheckExact(x)) { 13168 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 13169 "to maketrans it must be a dict"); 13170 goto err; 13171 } 13172 /* copy entries into the new dict, converting string keys to int keys */ 13173 while (PyDict_Next(x, &i, &key, &value)) { 13174 if (PyUnicode_Check(key)) { 13175 /* convert string keys to integer keys */ 13176 PyObject *newkey; 13177 if (PyUnicode_GET_LENGTH(key) != 1) { 13178 PyErr_SetString(PyExc_ValueError, "string keys in translate " 13179 "table must be of length 1"); 13180 goto err; 13181 } 13182 kind = PyUnicode_KIND(key); 13183 data = PyUnicode_DATA(key); 13184 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 13185 if (!newkey) 13186 goto err; 13187 res = PyDict_SetItem(new, newkey, value); 13188 Py_DECREF(newkey); 13189 if (res < 0) 13190 goto err; 13191 } else if (PyLong_Check(key)) { 13192 /* just keep integer keys */ 13193 if (PyDict_SetItem(new, key, value) < 0) 13194 goto err; 13195 } else { 13196 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 13197 "be strings or integers"); 13198 goto err; 13199 } 13200 } 13201 } 13202 return new; 13203 err: 13204 Py_DECREF(new); 13205 return NULL; 13206 } 13207 13208 PyDoc_STRVAR(translate__doc__, 13209 
"S.translate(table) -> str\n\ 13210 \n\ 13211 Return a copy of the string S in which each character has been mapped\n\ 13212 through the given translation table. The table must implement\n\ 13213 lookup/indexing via __getitem__, for instance a dictionary or list,\n\ 13214 mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\ 13215 this operation raises LookupError, the character is left untouched.\n\ 13216 Characters mapped to None are deleted."); 13217 13218 static PyObject* 13219 unicode_translate(PyObject *self, PyObject *table) 13220 { 13221 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 13222 } 13223 13224 PyDoc_STRVAR(upper__doc__, 13225 "S.upper() -> str\n\ 13226 \n\ 13227 Return a copy of S converted to uppercase."); 13228 13229 static PyObject* 13230 unicode_upper(PyObject *self) 13231 { 13232 if (PyUnicode_READY(self) == -1) 13233 return NULL; 13234 if (PyUnicode_IS_ASCII(self)) 13235 return ascii_upper_or_lower(self, 0); 13236 return case_operation(self, do_upper); 13237 } 13238 13239 PyDoc_STRVAR(zfill__doc__, 13240 "S.zfill(width) -> str\n\ 13241 \n\ 13242 Pad a numeric string S with zeros on the left, to fill a field\n\ 13243 of the specified width. 
The string S is never truncated."); 13244 13245 static PyObject * 13246 unicode_zfill(PyObject *self, PyObject *args) 13247 { 13248 Py_ssize_t fill; 13249 PyObject *u; 13250 Py_ssize_t width; 13251 int kind; 13252 void *data; 13253 Py_UCS4 chr; 13254 13255 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 13256 return NULL; 13257 13258 if (PyUnicode_READY(self) == -1) 13259 return NULL; 13260 13261 if (PyUnicode_GET_LENGTH(self) >= width) 13262 return unicode_result_unchanged(self); 13263 13264 fill = width - PyUnicode_GET_LENGTH(self); 13265 13266 u = pad(self, fill, 0, '0'); 13267 13268 if (u == NULL) 13269 return NULL; 13270 13271 kind = PyUnicode_KIND(u); 13272 data = PyUnicode_DATA(u); 13273 chr = PyUnicode_READ(kind, data, fill); 13274 13275 if (chr == '+' || chr == '-') { 13276 /* move sign to beginning of string */ 13277 PyUnicode_WRITE(kind, data, 0, chr); 13278 PyUnicode_WRITE(kind, data, fill, '0'); 13279 } 13280 13281 assert(_PyUnicode_CheckConsistency(u, 1)); 13282 return u; 13283 } 13284 13285 #if 0 13286 static PyObject * 13287 unicode__decimal2ascii(PyObject *self) 13288 { 13289 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 13290 } 13291 #endif 13292 13293 PyDoc_STRVAR(startswith__doc__, 13294 "S.startswith(prefix[, start[, end]]) -> bool\n\ 13295 \n\ 13296 Return True if S starts with the specified prefix, False otherwise.\n\ 13297 With optional start, test S beginning at that position.\n\ 13298 With optional end, stop comparing S at that position.\n\ 13299 prefix can also be a tuple of strings to try."); 13300 13301 static PyObject * 13302 unicode_startswith(PyObject *self, 13303 PyObject *args) 13304 { 13305 PyObject *subobj; 13306 PyObject *substring; 13307 Py_ssize_t start = 0; 13308 Py_ssize_t end = PY_SSIZE_T_MAX; 13309 int result; 13310 13311 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 13312 return NULL; 13313 if (PyTuple_Check(subobj)) { 13314 Py_ssize_t i; 13315 for (i = 0; i < 
PyTuple_GET_SIZE(subobj); i++) { 13316 substring = PyTuple_GET_ITEM(subobj, i); 13317 if (!PyUnicode_Check(substring)) { 13318 PyErr_Format(PyExc_TypeError, 13319 "tuple for startswith must only contain str, " 13320 "not %.100s", 13321 Py_TYPE(substring)->tp_name); 13322 return NULL; 13323 } 13324 result = tailmatch(self, substring, start, end, -1); 13325 if (result == -1) 13326 return NULL; 13327 if (result) { 13328 Py_RETURN_TRUE; 13329 } 13330 } 13331 /* nothing matched */ 13332 Py_RETURN_FALSE; 13333 } 13334 if (!PyUnicode_Check(subobj)) { 13335 PyErr_Format(PyExc_TypeError, 13336 "startswith first arg must be str or " 13337 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name); 13338 return NULL; 13339 } 13340 result = tailmatch(self, subobj, start, end, -1); 13341 if (result == -1) 13342 return NULL; 13343 return PyBool_FromLong(result); 13344 } 13345 13346 13347 PyDoc_STRVAR(endswith__doc__, 13348 "S.endswith(suffix[, start[, end]]) -> bool\n\ 13349 \n\ 13350 Return True if S ends with the specified suffix, False otherwise.\n\ 13351 With optional start, test S beginning at that position.\n\ 13352 With optional end, stop comparing S at that position.\n\ 13353 suffix can also be a tuple of strings to try."); 13354 13355 static PyObject * 13356 unicode_endswith(PyObject *self, 13357 PyObject *args) 13358 { 13359 PyObject *subobj; 13360 PyObject *substring; 13361 Py_ssize_t start = 0; 13362 Py_ssize_t end = PY_SSIZE_T_MAX; 13363 int result; 13364 13365 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 13366 return NULL; 13367 if (PyTuple_Check(subobj)) { 13368 Py_ssize_t i; 13369 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13370 substring = PyTuple_GET_ITEM(subobj, i); 13371 if (!PyUnicode_Check(substring)) { 13372 PyErr_Format(PyExc_TypeError, 13373 "tuple for endswith must only contain str, " 13374 "not %.100s", 13375 Py_TYPE(substring)->tp_name); 13376 return NULL; 13377 } 13378 result = tailmatch(self, substring, start, end, 
+1); 13379 if (result == -1) 13380 return NULL; 13381 if (result) { 13382 Py_RETURN_TRUE; 13383 } 13384 } 13385 Py_RETURN_FALSE; 13386 } 13387 if (!PyUnicode_Check(subobj)) { 13388 PyErr_Format(PyExc_TypeError, 13389 "endswith first arg must be str or " 13390 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name); 13391 return NULL; 13392 } 13393 result = tailmatch(self, subobj, start, end, +1); 13394 if (result == -1) 13395 return NULL; 13396 return PyBool_FromLong(result); 13397 } 13398 13399 static inline void 13400 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer) 13401 { 13402 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); 13403 writer->data = PyUnicode_DATA(writer->buffer); 13404 13405 if (!writer->readonly) { 13406 writer->kind = PyUnicode_KIND(writer->buffer); 13407 writer->size = PyUnicode_GET_LENGTH(writer->buffer); 13408 } 13409 else { 13410 /* use a value smaller than PyUnicode_1BYTE_KIND() so 13411 _PyUnicodeWriter_PrepareKind() will copy the buffer. */ 13412 writer->kind = PyUnicode_WCHAR_KIND; 13413 assert(writer->kind <= PyUnicode_1BYTE_KIND); 13414 13415 /* Copy-on-write mode: set buffer size to 0 so 13416 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on 13417 * next write. */ 13418 writer->size = 0; 13419 } 13420 } 13421 13422 void 13423 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) 13424 { 13425 memset(writer, 0, sizeof(*writer)); 13426 13427 /* ASCII is the bare minimum */ 13428 writer->min_char = 127; 13429 13430 /* use a value smaller than PyUnicode_1BYTE_KIND() so 13431 _PyUnicodeWriter_PrepareKind() will copy the buffer. 
*/ 13432 writer->kind = PyUnicode_WCHAR_KIND; 13433 assert(writer->kind <= PyUnicode_1BYTE_KIND); 13434 } 13435 13436 int 13437 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 13438 Py_ssize_t length, Py_UCS4 maxchar) 13439 { 13440 Py_ssize_t newlen; 13441 PyObject *newbuffer; 13442 13443 assert(maxchar <= MAX_UNICODE); 13444 13445 /* ensure that the _PyUnicodeWriter_Prepare macro was used */ 13446 assert((maxchar > writer->maxchar && length >= 0) 13447 || length > 0); 13448 13449 if (length > PY_SSIZE_T_MAX - writer->pos) { 13450 PyErr_NoMemory(); 13451 return -1; 13452 } 13453 newlen = writer->pos + length; 13454 13455 maxchar = Py_MAX(maxchar, writer->min_char); 13456 13457 if (writer->buffer == NULL) { 13458 assert(!writer->readonly); 13459 if (writer->overallocate 13460 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13461 /* overallocate to limit the number of realloc() */ 13462 newlen += newlen / OVERALLOCATE_FACTOR; 13463 } 13464 if (newlen < writer->min_length) 13465 newlen = writer->min_length; 13466 13467 writer->buffer = PyUnicode_New(newlen, maxchar); 13468 if (writer->buffer == NULL) 13469 return -1; 13470 } 13471 else if (newlen > writer->size) { 13472 if (writer->overallocate 13473 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13474 /* overallocate to limit the number of realloc() */ 13475 newlen += newlen / OVERALLOCATE_FACTOR; 13476 } 13477 if (newlen < writer->min_length) 13478 newlen = writer->min_length; 13479 13480 if (maxchar > writer->maxchar || writer->readonly) { 13481 /* resize + widen */ 13482 maxchar = Py_MAX(maxchar, writer->maxchar); 13483 newbuffer = PyUnicode_New(newlen, maxchar); 13484 if (newbuffer == NULL) 13485 return -1; 13486 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13487 writer->buffer, 0, writer->pos); 13488 Py_DECREF(writer->buffer); 13489 writer->readonly = 0; 13490 } 13491 else { 13492 newbuffer = resize_compact(writer->buffer, newlen); 13493 if (newbuffer == NULL) 13494 
return -1; 13495 } 13496 writer->buffer = newbuffer; 13497 } 13498 else if (maxchar > writer->maxchar) { 13499 assert(!writer->readonly); 13500 newbuffer = PyUnicode_New(writer->size, maxchar); 13501 if (newbuffer == NULL) 13502 return -1; 13503 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13504 writer->buffer, 0, writer->pos); 13505 Py_SETREF(writer->buffer, newbuffer); 13506 } 13507 _PyUnicodeWriter_Update(writer); 13508 return 0; 13509 13510 #undef OVERALLOCATE_FACTOR 13511 } 13512 13513 int 13514 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, 13515 enum PyUnicode_Kind kind) 13516 { 13517 Py_UCS4 maxchar; 13518 13519 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */ 13520 assert(writer->kind < kind); 13521 13522 switch (kind) 13523 { 13524 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break; 13525 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break; 13526 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break; 13527 default: 13528 assert(0 && "invalid kind"); 13529 return -1; 13530 } 13531 13532 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar); 13533 } 13534 13535 static inline int 13536 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) 13537 { 13538 assert(ch <= MAX_UNICODE); 13539 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0) 13540 return -1; 13541 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch); 13542 writer->pos++; 13543 return 0; 13544 } 13545 13546 int 13547 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch) 13548 { 13549 return _PyUnicodeWriter_WriteCharInline(writer, ch); 13550 } 13551 13552 int 13553 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) 13554 { 13555 Py_UCS4 maxchar; 13556 Py_ssize_t len; 13557 13558 if (PyUnicode_READY(str) == -1) 13559 return -1; 13560 len = PyUnicode_GET_LENGTH(str); 13561 if (len == 0) 13562 return 0; 13563 maxchar = PyUnicode_MAX_CHAR_VALUE(str); 13564 if (maxchar > writer->maxchar || len > writer->size - 
writer->pos) { 13565 if (writer->buffer == NULL && !writer->overallocate) { 13566 assert(_PyUnicode_CheckConsistency(str, 1)); 13567 writer->readonly = 1; 13568 Py_INCREF(str); 13569 writer->buffer = str; 13570 _PyUnicodeWriter_Update(writer); 13571 writer->pos += len; 13572 return 0; 13573 } 13574 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) 13575 return -1; 13576 } 13577 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13578 str, 0, len); 13579 writer->pos += len; 13580 return 0; 13581 } 13582 13583 int 13584 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str, 13585 Py_ssize_t start, Py_ssize_t end) 13586 { 13587 Py_UCS4 maxchar; 13588 Py_ssize_t len; 13589 13590 if (PyUnicode_READY(str) == -1) 13591 return -1; 13592 13593 assert(0 <= start); 13594 assert(end <= PyUnicode_GET_LENGTH(str)); 13595 assert(start <= end); 13596 13597 if (end == 0) 13598 return 0; 13599 13600 if (start == 0 && end == PyUnicode_GET_LENGTH(str)) 13601 return _PyUnicodeWriter_WriteStr(writer, str); 13602 13603 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 13604 maxchar = _PyUnicode_FindMaxChar(str, start, end); 13605 else 13606 maxchar = writer->maxchar; 13607 len = end - start; 13608 13609 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) 13610 return -1; 13611 13612 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13613 str, start, len); 13614 writer->pos += len; 13615 return 0; 13616 } 13617 13618 int 13619 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, 13620 const char *ascii, Py_ssize_t len) 13621 { 13622 if (len == -1) 13623 len = strlen(ascii); 13624 13625 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128); 13626 13627 if (writer->buffer == NULL && !writer->overallocate) { 13628 PyObject *str; 13629 13630 str = _PyUnicode_FromASCII(ascii, len); 13631 if (str == NULL) 13632 return -1; 13633 13634 writer->readonly = 1; 13635 writer->buffer = str; 13636 
_PyUnicodeWriter_Update(writer); 13637 writer->pos += len; 13638 return 0; 13639 } 13640 13641 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) 13642 return -1; 13643 13644 switch (writer->kind) 13645 { 13646 case PyUnicode_1BYTE_KIND: 13647 { 13648 const Py_UCS1 *str = (const Py_UCS1 *)ascii; 13649 Py_UCS1 *data = writer->data; 13650 13651 memcpy(data + writer->pos, str, len); 13652 break; 13653 } 13654 case PyUnicode_2BYTE_KIND: 13655 { 13656 _PyUnicode_CONVERT_BYTES( 13657 Py_UCS1, Py_UCS2, 13658 ascii, ascii + len, 13659 (Py_UCS2 *)writer->data + writer->pos); 13660 break; 13661 } 13662 case PyUnicode_4BYTE_KIND: 13663 { 13664 _PyUnicode_CONVERT_BYTES( 13665 Py_UCS1, Py_UCS4, 13666 ascii, ascii + len, 13667 (Py_UCS4 *)writer->data + writer->pos); 13668 break; 13669 } 13670 default: 13671 assert(0); 13672 } 13673 13674 writer->pos += len; 13675 return 0; 13676 } 13677 13678 int 13679 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, 13680 const char *str, Py_ssize_t len) 13681 { 13682 Py_UCS4 maxchar; 13683 13684 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len); 13685 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) 13686 return -1; 13687 unicode_write_cstr(writer->buffer, writer->pos, str, len); 13688 writer->pos += len; 13689 return 0; 13690 } 13691 13692 PyObject * 13693 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) 13694 { 13695 PyObject *str; 13696 13697 if (writer->pos == 0) { 13698 Py_CLEAR(writer->buffer); 13699 _Py_RETURN_UNICODE_EMPTY(); 13700 } 13701 13702 str = writer->buffer; 13703 writer->buffer = NULL; 13704 13705 if (writer->readonly) { 13706 assert(PyUnicode_GET_LENGTH(str) == writer->pos); 13707 return str; 13708 } 13709 13710 if (PyUnicode_GET_LENGTH(str) != writer->pos) { 13711 PyObject *str2; 13712 str2 = resize_compact(str, writer->pos); 13713 if (str2 == NULL) { 13714 Py_DECREF(str); 13715 return NULL; 13716 } 13717 str = str2; 13718 } 13719 13720 assert(_PyUnicode_CheckConsistency(str, 1)); 
13721 return unicode_result_ready(str); 13722 } 13723 13724 void 13725 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) 13726 { 13727 Py_CLEAR(writer->buffer); 13728 } 13729 13730 #include "stringlib/unicode_format.h" 13731 13732 PyDoc_STRVAR(format__doc__, 13733 "S.format(*args, **kwargs) -> str\n\ 13734 \n\ 13735 Return a formatted version of S, using substitutions from args and kwargs.\n\ 13736 The substitutions are identified by braces ('{' and '}')."); 13737 13738 PyDoc_STRVAR(format_map__doc__, 13739 "S.format_map(mapping) -> str\n\ 13740 \n\ 13741 Return a formatted version of S, using substitutions from mapping.\n\ 13742 The substitutions are identified by braces ('{' and '}')."); 13743 13744 static PyObject * 13745 unicode__format__(PyObject* self, PyObject* args) 13746 { 13747 PyObject *format_spec; 13748 _PyUnicodeWriter writer; 13749 int ret; 13750 13751 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 13752 return NULL; 13753 13754 if (PyUnicode_READY(self) == -1) 13755 return NULL; 13756 _PyUnicodeWriter_Init(&writer); 13757 ret = _PyUnicode_FormatAdvancedWriter(&writer, 13758 self, format_spec, 0, 13759 PyUnicode_GET_LENGTH(format_spec)); 13760 if (ret == -1) { 13761 _PyUnicodeWriter_Dealloc(&writer); 13762 return NULL; 13763 } 13764 return _PyUnicodeWriter_Finish(&writer); 13765 } 13766 13767 PyDoc_STRVAR(p_format__doc__, 13768 "S.__format__(format_spec) -> str\n\ 13769 \n\ 13770 Return a formatted version of S as described by format_spec."); 13771 13772 static PyObject * 13773 unicode__sizeof__(PyObject *v) 13774 { 13775 Py_ssize_t size; 13776 13777 /* If it's a compact object, account for base structure + 13778 character data. 
*/ 13779 if (PyUnicode_IS_COMPACT_ASCII(v)) 13780 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 13781 else if (PyUnicode_IS_COMPACT(v)) 13782 size = sizeof(PyCompactUnicodeObject) + 13783 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v); 13784 else { 13785 /* If it is a two-block object, account for base object, and 13786 for character block if present. */ 13787 size = sizeof(PyUnicodeObject); 13788 if (_PyUnicode_DATA_ANY(v)) 13789 size += (PyUnicode_GET_LENGTH(v) + 1) * 13790 PyUnicode_KIND(v); 13791 } 13792 /* If the wstr pointer is present, account for it unless it is shared 13793 with the data pointer. Check if the data is not shared. */ 13794 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 13795 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 13796 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 13797 size += PyUnicode_UTF8_LENGTH(v) + 1; 13798 13799 return PyLong_FromSsize_t(size); 13800 } 13801 13802 PyDoc_STRVAR(sizeof__doc__, 13803 "S.__sizeof__() -> size of S in memory, in bytes"); 13804 13805 static PyObject * 13806 unicode_getnewargs(PyObject *v) 13807 { 13808 PyObject *copy = _PyUnicode_Copy(v); 13809 if (!copy) 13810 return NULL; 13811 return Py_BuildValue("(N)", copy); 13812 } 13813 13814 static PyMethodDef unicode_methods[] = { 13815 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 13816 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 13817 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__}, 13818 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__}, 13819 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 13820 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 13821 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__}, 13822 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 13823 {"center", (PyCFunction) unicode_center, METH_VARARGS, 
center__doc__}, 13824 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 13825 {"expandtabs", (PyCFunction) unicode_expandtabs, 13826 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__}, 13827 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 13828 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 13829 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 13830 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 13831 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 13832 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 13833 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 13834 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 13835 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 13836 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 13837 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 13838 {"splitlines", (PyCFunction) unicode_splitlines, 13839 METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 13840 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 13841 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 13842 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 13843 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 13844 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 13845 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 13846 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 13847 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 13848 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 13849 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 13850 {"isdecimal", (PyCFunction) unicode_isdecimal, 
METH_NOARGS, isdecimal__doc__}, 13851 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 13852 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 13853 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 13854 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 13855 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 13856 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 13857 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 13858 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 13859 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 13860 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 13861 UNICODE_MAKETRANS_METHODDEF 13862 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 13863 #if 0 13864 /* These methods are just used for debugging the implementation. 
*/ 13865 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 13866 #endif 13867 13868 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 13869 {NULL, NULL} 13870 }; 13871 13872 static PyObject * 13873 unicode_mod(PyObject *v, PyObject *w) 13874 { 13875 if (!PyUnicode_Check(v)) 13876 Py_RETURN_NOTIMPLEMENTED; 13877 return PyUnicode_Format(v, w); 13878 } 13879 13880 static PyNumberMethods unicode_as_number = { 13881 0, /*nb_add*/ 13882 0, /*nb_subtract*/ 13883 0, /*nb_multiply*/ 13884 unicode_mod, /*nb_remainder*/ 13885 }; 13886 13887 static PySequenceMethods unicode_as_sequence = { 13888 (lenfunc) unicode_length, /* sq_length */ 13889 PyUnicode_Concat, /* sq_concat */ 13890 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 13891 (ssizeargfunc) unicode_getitem, /* sq_item */ 13892 0, /* sq_slice */ 13893 0, /* sq_ass_item */ 13894 0, /* sq_ass_slice */ 13895 PyUnicode_Contains, /* sq_contains */ 13896 }; 13897 13898 static PyObject* 13899 unicode_subscript(PyObject* self, PyObject* item) 13900 { 13901 if (PyUnicode_READY(self) == -1) 13902 return NULL; 13903 13904 if (PyIndex_Check(item)) { 13905 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 13906 if (i == -1 && PyErr_Occurred()) 13907 return NULL; 13908 if (i < 0) 13909 i += PyUnicode_GET_LENGTH(self); 13910 return unicode_getitem(self, i); 13911 } else if (PySlice_Check(item)) { 13912 Py_ssize_t start, stop, step, slicelength, cur, i; 13913 PyObject *result; 13914 void *src_data, *dest_data; 13915 int src_kind, dest_kind; 13916 Py_UCS4 ch, max_char, kind_limit; 13917 13918 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 13919 &start, &stop, &step, &slicelength) < 0) { 13920 return NULL; 13921 } 13922 13923 if (slicelength <= 0) { 13924 _Py_RETURN_UNICODE_EMPTY(); 13925 } else if (start == 0 && step == 1 && 13926 slicelength == PyUnicode_GET_LENGTH(self)) { 13927 return unicode_result_unchanged(self); 13928 } else if (step == 1) { 13929 return 
PyUnicode_Substring(self, 13930 start, start + slicelength); 13931 } 13932 /* General case */ 13933 src_kind = PyUnicode_KIND(self); 13934 src_data = PyUnicode_DATA(self); 13935 if (!PyUnicode_IS_ASCII(self)) { 13936 kind_limit = kind_maxchar_limit(src_kind); 13937 max_char = 0; 13938 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13939 ch = PyUnicode_READ(src_kind, src_data, cur); 13940 if (ch > max_char) { 13941 max_char = ch; 13942 if (max_char >= kind_limit) 13943 break; 13944 } 13945 } 13946 } 13947 else 13948 max_char = 127; 13949 result = PyUnicode_New(slicelength, max_char); 13950 if (result == NULL) 13951 return NULL; 13952 dest_kind = PyUnicode_KIND(result); 13953 dest_data = PyUnicode_DATA(result); 13954 13955 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13956 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 13957 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 13958 } 13959 assert(_PyUnicode_CheckConsistency(result, 1)); 13960 return result; 13961 } else { 13962 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 13963 return NULL; 13964 } 13965 } 13966 13967 static PyMappingMethods unicode_as_mapping = { 13968 (lenfunc)unicode_length, /* mp_length */ 13969 (binaryfunc)unicode_subscript, /* mp_subscript */ 13970 (objobjargproc)0, /* mp_ass_subscript */ 13971 }; 13972 13973 13974 /* Helpers for PyUnicode_Format() */ 13975 13976 struct unicode_formatter_t { 13977 PyObject *args; 13978 int args_owned; 13979 Py_ssize_t arglen, argidx; 13980 PyObject *dict; 13981 13982 enum PyUnicode_Kind fmtkind; 13983 Py_ssize_t fmtcnt, fmtpos; 13984 void *fmtdata; 13985 PyObject *fmtstr; 13986 13987 _PyUnicodeWriter writer; 13988 }; 13989 13990 struct unicode_format_arg_t { 13991 Py_UCS4 ch; 13992 int flags; 13993 Py_ssize_t width; 13994 int prec; 13995 int sign; 13996 }; 13997 13998 static PyObject * 13999 unicode_format_getnextarg(struct unicode_formatter_t *ctx) 14000 { 14001 Py_ssize_t argidx = ctx->argidx; 14002 
14003 if (argidx < ctx->arglen) { 14004 ctx->argidx++; 14005 if (ctx->arglen < 0) 14006 return ctx->args; 14007 else 14008 return PyTuple_GetItem(ctx->args, argidx); 14009 } 14010 PyErr_SetString(PyExc_TypeError, 14011 "not enough arguments for format string"); 14012 return NULL; 14013 } 14014 14015 /* Returns a new reference to a PyUnicode object, or NULL on failure. */ 14016 14017 /* Format a float into the writer if the writer is not NULL, or into *p_output 14018 otherwise. 14019 14020 Return 0 on success, raise an exception and return -1 on error. */ 14021 static int 14022 formatfloat(PyObject *v, struct unicode_format_arg_t *arg, 14023 PyObject **p_output, 14024 _PyUnicodeWriter *writer) 14025 { 14026 char *p; 14027 double x; 14028 Py_ssize_t len; 14029 int prec; 14030 int dtoa_flags; 14031 14032 x = PyFloat_AsDouble(v); 14033 if (x == -1.0 && PyErr_Occurred()) 14034 return -1; 14035 14036 prec = arg->prec; 14037 if (prec < 0) 14038 prec = 6; 14039 14040 if (arg->flags & F_ALT) 14041 dtoa_flags = Py_DTSF_ALT; 14042 else 14043 dtoa_flags = 0; 14044 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL); 14045 if (p == NULL) 14046 return -1; 14047 len = strlen(p); 14048 if (writer) { 14049 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) { 14050 PyMem_Free(p); 14051 return -1; 14052 } 14053 } 14054 else 14055 *p_output = _PyUnicode_FromASCII(p, len); 14056 PyMem_Free(p); 14057 return 0; 14058 } 14059 14060 /* formatlong() emulates the format codes d, u, o, x and X, and 14061 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for 14062 * Python's regular ints. 14063 * Return value: a new PyUnicodeObject*, or NULL if error. 14064 * The output string is of the form 14065 * "-"? ("0x" | "0X")? digit+ 14066 * "0x"/"0X" are present only for x and X conversions, with F_ALT 14067 * set in flags. 
The case of hex digits will be correct, 14068 * There will be at least prec digits, zero-filled on the left if 14069 * necessary to get that many. 14070 * val object to be converted 14071 * flags bitmask of format flags; only F_ALT is looked at 14072 * prec minimum number of digits; 0-fill on left if needed 14073 * type a character in [duoxX]; u acts the same as d 14074 * 14075 * CAUTION: o, x and X conversions on regular ints can never 14076 * produce a '-' sign, but can for Python's unbounded ints. 14077 */ 14078 PyObject * 14079 _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type) 14080 { 14081 PyObject *result = NULL; 14082 char *buf; 14083 Py_ssize_t i; 14084 int sign; /* 1 if '-', else 0 */ 14085 int len; /* number of characters */ 14086 Py_ssize_t llen; 14087 int numdigits; /* len == numnondigits + numdigits */ 14088 int numnondigits = 0; 14089 14090 /* Avoid exceeding SSIZE_T_MAX */ 14091 if (prec > INT_MAX-3) { 14092 PyErr_SetString(PyExc_OverflowError, 14093 "precision too large"); 14094 return NULL; 14095 } 14096 14097 assert(PyLong_Check(val)); 14098 14099 switch (type) { 14100 default: 14101 assert(!"'type' not in [diuoxX]"); 14102 case 'd': 14103 case 'i': 14104 case 'u': 14105 /* int and int subclasses should print numerically when a numeric */ 14106 /* format code is used (see issue18780) */ 14107 result = PyNumber_ToBase(val, 10); 14108 break; 14109 case 'o': 14110 numnondigits = 2; 14111 result = PyNumber_ToBase(val, 8); 14112 break; 14113 case 'x': 14114 case 'X': 14115 numnondigits = 2; 14116 result = PyNumber_ToBase(val, 16); 14117 break; 14118 } 14119 if (!result) 14120 return NULL; 14121 14122 assert(unicode_modifiable(result)); 14123 assert(PyUnicode_IS_READY(result)); 14124 assert(PyUnicode_IS_ASCII(result)); 14125 14126 /* To modify the string in-place, there can only be one reference. 
*/ 14127 if (Py_REFCNT(result) != 1) { 14128 Py_DECREF(result); 14129 PyErr_BadInternalCall(); 14130 return NULL; 14131 } 14132 buf = PyUnicode_DATA(result); 14133 llen = PyUnicode_GET_LENGTH(result); 14134 if (llen > INT_MAX) { 14135 Py_DECREF(result); 14136 PyErr_SetString(PyExc_ValueError, 14137 "string too large in _PyUnicode_FormatLong"); 14138 return NULL; 14139 } 14140 len = (int)llen; 14141 sign = buf[0] == '-'; 14142 numnondigits += sign; 14143 numdigits = len - numnondigits; 14144 assert(numdigits > 0); 14145 14146 /* Get rid of base marker unless F_ALT */ 14147 if (((alt) == 0 && 14148 (type == 'o' || type == 'x' || type == 'X'))) { 14149 assert(buf[sign] == '0'); 14150 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || 14151 buf[sign+1] == 'o'); 14152 numnondigits -= 2; 14153 buf += 2; 14154 len -= 2; 14155 if (sign) 14156 buf[0] = '-'; 14157 assert(len == numnondigits + numdigits); 14158 assert(numdigits > 0); 14159 } 14160 14161 /* Fill with leading zeroes to meet minimum width. */ 14162 if (prec > numdigits) { 14163 PyObject *r1 = PyBytes_FromStringAndSize(NULL, 14164 numnondigits + prec); 14165 char *b1; 14166 if (!r1) { 14167 Py_DECREF(result); 14168 return NULL; 14169 } 14170 b1 = PyBytes_AS_STRING(r1); 14171 for (i = 0; i < numnondigits; ++i) 14172 *b1++ = *buf++; 14173 for (i = 0; i < prec - numdigits; i++) 14174 *b1++ = '0'; 14175 for (i = 0; i < numdigits; i++) 14176 *b1++ = *buf++; 14177 *b1 = '\0'; 14178 Py_DECREF(result); 14179 result = r1; 14180 buf = PyBytes_AS_STRING(result); 14181 len = numnondigits + prec; 14182 } 14183 14184 /* Fix up case for hex conversions. */ 14185 if (type == 'X') { 14186 /* Need to convert all lower case letters to upper case. 14187 and need to convert 0x to 0X (and -0x to -0X). 
*/ 14188 for (i = 0; i < len; i++) 14189 if (buf[i] >= 'a' && buf[i] <= 'x') 14190 buf[i] -= 'a'-'A'; 14191 } 14192 if (!PyUnicode_Check(result) 14193 || buf != PyUnicode_DATA(result)) { 14194 PyObject *unicode; 14195 unicode = _PyUnicode_FromASCII(buf, len); 14196 Py_DECREF(result); 14197 result = unicode; 14198 } 14199 else if (len != PyUnicode_GET_LENGTH(result)) { 14200 if (PyUnicode_Resize(&result, len) < 0) 14201 Py_CLEAR(result); 14202 } 14203 return result; 14204 } 14205 14206 /* Format an integer or a float as an integer. 14207 * Return 1 if the number has been formatted into the writer, 14208 * 0 if the number has been formatted into *p_output 14209 * -1 and raise an exception on error */ 14210 static int 14211 mainformatlong(PyObject *v, 14212 struct unicode_format_arg_t *arg, 14213 PyObject **p_output, 14214 _PyUnicodeWriter *writer) 14215 { 14216 PyObject *iobj, *res; 14217 char type = (char)arg->ch; 14218 14219 if (!PyNumber_Check(v)) 14220 goto wrongtype; 14221 14222 /* make sure number is a type of integer for o, x, and X */ 14223 if (!PyLong_Check(v)) { 14224 if (type == 'o' || type == 'x' || type == 'X') { 14225 iobj = PyNumber_Index(v); 14226 if (iobj == NULL) { 14227 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14228 goto wrongtype; 14229 return -1; 14230 } 14231 } 14232 else { 14233 iobj = PyNumber_Long(v); 14234 if (iobj == NULL ) { 14235 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14236 goto wrongtype; 14237 return -1; 14238 } 14239 } 14240 assert(PyLong_Check(iobj)); 14241 } 14242 else { 14243 iobj = v; 14244 Py_INCREF(iobj); 14245 } 14246 14247 if (PyLong_CheckExact(v) 14248 && arg->width == -1 && arg->prec == -1 14249 && !(arg->flags & (F_SIGN | F_BLANK)) 14250 && type != 'X') 14251 { 14252 /* Fast path */ 14253 int alternate = arg->flags & F_ALT; 14254 int base; 14255 14256 switch(type) 14257 { 14258 default: 14259 assert(0 && "'type' not in [diuoxX]"); 14260 case 'd': 14261 case 'i': 14262 case 'u': 14263 base = 10; 14264 break; 14265 
case 'o': 14266 base = 8; 14267 break; 14268 case 'x': 14269 case 'X': 14270 base = 16; 14271 break; 14272 } 14273 14274 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { 14275 Py_DECREF(iobj); 14276 return -1; 14277 } 14278 Py_DECREF(iobj); 14279 return 1; 14280 } 14281 14282 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type); 14283 Py_DECREF(iobj); 14284 if (res == NULL) 14285 return -1; 14286 *p_output = res; 14287 return 0; 14288 14289 wrongtype: 14290 switch(type) 14291 { 14292 case 'o': 14293 case 'x': 14294 case 'X': 14295 PyErr_Format(PyExc_TypeError, 14296 "%%%c format: an integer is required, " 14297 "not %.200s", 14298 type, Py_TYPE(v)->tp_name); 14299 break; 14300 default: 14301 PyErr_Format(PyExc_TypeError, 14302 "%%%c format: a number is required, " 14303 "not %.200s", 14304 type, Py_TYPE(v)->tp_name); 14305 break; 14306 } 14307 return -1; 14308 } 14309 14310 static Py_UCS4 14311 formatchar(PyObject *v) 14312 { 14313 /* presume that the buffer is at least 3 characters long */ 14314 if (PyUnicode_Check(v)) { 14315 if (PyUnicode_GET_LENGTH(v) == 1) { 14316 return PyUnicode_READ_CHAR(v, 0); 14317 } 14318 goto onError; 14319 } 14320 else { 14321 PyObject *iobj; 14322 long x; 14323 /* make sure number is a type of integer */ 14324 if (!PyLong_Check(v)) { 14325 iobj = PyNumber_Index(v); 14326 if (iobj == NULL) { 14327 goto onError; 14328 } 14329 x = PyLong_AsLong(iobj); 14330 Py_DECREF(iobj); 14331 } 14332 else { 14333 x = PyLong_AsLong(v); 14334 } 14335 if (x == -1 && PyErr_Occurred()) 14336 goto onError; 14337 14338 if (x < 0 || x > MAX_UNICODE) { 14339 PyErr_SetString(PyExc_OverflowError, 14340 "%c arg not in range(0x110000)"); 14341 return (Py_UCS4) -1; 14342 } 14343 14344 return (Py_UCS4) x; 14345 } 14346 14347 onError: 14348 PyErr_SetString(PyExc_TypeError, 14349 "%c requires int or char"); 14350 return (Py_UCS4) -1; 14351 } 14352 14353 /* Parse options of an argument: flags, width, precision. 
14354 Handle also "%(name)" syntax. 14355 14356 Return 0 if the argument has been formatted into arg->str. 14357 Return 1 if the argument has been written into ctx->writer, 14358 Raise an exception and return -1 on error. */ 14359 static int 14360 unicode_format_arg_parse(struct unicode_formatter_t *ctx, 14361 struct unicode_format_arg_t *arg) 14362 { 14363 #define FORMAT_READ(ctx) \ 14364 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos) 14365 14366 PyObject *v; 14367 14368 if (arg->ch == '(') { 14369 /* Get argument value from a dictionary. Example: "%(name)s". */ 14370 Py_ssize_t keystart; 14371 Py_ssize_t keylen; 14372 PyObject *key; 14373 int pcount = 1; 14374 14375 if (ctx->dict == NULL) { 14376 PyErr_SetString(PyExc_TypeError, 14377 "format requires a mapping"); 14378 return -1; 14379 } 14380 ++ctx->fmtpos; 14381 --ctx->fmtcnt; 14382 keystart = ctx->fmtpos; 14383 /* Skip over balanced parentheses */ 14384 while (pcount > 0 && --ctx->fmtcnt >= 0) { 14385 arg->ch = FORMAT_READ(ctx); 14386 if (arg->ch == ')') 14387 --pcount; 14388 else if (arg->ch == '(') 14389 ++pcount; 14390 ctx->fmtpos++; 14391 } 14392 keylen = ctx->fmtpos - keystart - 1; 14393 if (ctx->fmtcnt < 0 || pcount > 0) { 14394 PyErr_SetString(PyExc_ValueError, 14395 "incomplete format key"); 14396 return -1; 14397 } 14398 key = PyUnicode_Substring(ctx->fmtstr, 14399 keystart, keystart + keylen); 14400 if (key == NULL) 14401 return -1; 14402 if (ctx->args_owned) { 14403 ctx->args_owned = 0; 14404 Py_DECREF(ctx->args); 14405 } 14406 ctx->args = PyObject_GetItem(ctx->dict, key); 14407 Py_DECREF(key); 14408 if (ctx->args == NULL) 14409 return -1; 14410 ctx->args_owned = 1; 14411 ctx->arglen = -1; 14412 ctx->argidx = -2; 14413 } 14414 14415 /* Parse flags. Example: "%+i" => flags=F_SIGN. 
*/ 14416 while (--ctx->fmtcnt >= 0) { 14417 arg->ch = FORMAT_READ(ctx); 14418 ctx->fmtpos++; 14419 switch (arg->ch) { 14420 case '-': arg->flags |= F_LJUST; continue; 14421 case '+': arg->flags |= F_SIGN; continue; 14422 case ' ': arg->flags |= F_BLANK; continue; 14423 case '#': arg->flags |= F_ALT; continue; 14424 case '0': arg->flags |= F_ZERO; continue; 14425 } 14426 break; 14427 } 14428 14429 /* Parse width. Example: "%10s" => width=10 */ 14430 if (arg->ch == '*') { 14431 v = unicode_format_getnextarg(ctx); 14432 if (v == NULL) 14433 return -1; 14434 if (!PyLong_Check(v)) { 14435 PyErr_SetString(PyExc_TypeError, 14436 "* wants int"); 14437 return -1; 14438 } 14439 arg->width = PyLong_AsSsize_t(v); 14440 if (arg->width == -1 && PyErr_Occurred()) 14441 return -1; 14442 if (arg->width < 0) { 14443 arg->flags |= F_LJUST; 14444 arg->width = -arg->width; 14445 } 14446 if (--ctx->fmtcnt >= 0) { 14447 arg->ch = FORMAT_READ(ctx); 14448 ctx->fmtpos++; 14449 } 14450 } 14451 else if (arg->ch >= '0' && arg->ch <= '9') { 14452 arg->width = arg->ch - '0'; 14453 while (--ctx->fmtcnt >= 0) { 14454 arg->ch = FORMAT_READ(ctx); 14455 ctx->fmtpos++; 14456 if (arg->ch < '0' || arg->ch > '9') 14457 break; 14458 /* Since arg->ch is unsigned, the RHS would end up as unsigned, 14459 mixing signed and unsigned comparison. Since arg->ch is between 14460 '0' and '9', casting to int is safe. */ 14461 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) { 14462 PyErr_SetString(PyExc_ValueError, 14463 "width too big"); 14464 return -1; 14465 } 14466 arg->width = arg->width*10 + (arg->ch - '0'); 14467 } 14468 } 14469 14470 /* Parse precision. 
Example: "%.3f" => prec=3 */ 14471 if (arg->ch == '.') { 14472 arg->prec = 0; 14473 if (--ctx->fmtcnt >= 0) { 14474 arg->ch = FORMAT_READ(ctx); 14475 ctx->fmtpos++; 14476 } 14477 if (arg->ch == '*') { 14478 v = unicode_format_getnextarg(ctx); 14479 if (v == NULL) 14480 return -1; 14481 if (!PyLong_Check(v)) { 14482 PyErr_SetString(PyExc_TypeError, 14483 "* wants int"); 14484 return -1; 14485 } 14486 arg->prec = _PyLong_AsInt(v); 14487 if (arg->prec == -1 && PyErr_Occurred()) 14488 return -1; 14489 if (arg->prec < 0) 14490 arg->prec = 0; 14491 if (--ctx->fmtcnt >= 0) { 14492 arg->ch = FORMAT_READ(ctx); 14493 ctx->fmtpos++; 14494 } 14495 } 14496 else if (arg->ch >= '0' && arg->ch <= '9') { 14497 arg->prec = arg->ch - '0'; 14498 while (--ctx->fmtcnt >= 0) { 14499 arg->ch = FORMAT_READ(ctx); 14500 ctx->fmtpos++; 14501 if (arg->ch < '0' || arg->ch > '9') 14502 break; 14503 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) { 14504 PyErr_SetString(PyExc_ValueError, 14505 "precision too big"); 14506 return -1; 14507 } 14508 arg->prec = arg->prec*10 + (arg->ch - '0'); 14509 } 14510 } 14511 } 14512 14513 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */ 14514 if (ctx->fmtcnt >= 0) { 14515 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') { 14516 if (--ctx->fmtcnt >= 0) { 14517 arg->ch = FORMAT_READ(ctx); 14518 ctx->fmtpos++; 14519 } 14520 } 14521 } 14522 if (ctx->fmtcnt < 0) { 14523 PyErr_SetString(PyExc_ValueError, 14524 "incomplete format"); 14525 return -1; 14526 } 14527 return 0; 14528 14529 #undef FORMAT_READ 14530 } 14531 14532 /* Format one argument. Supported conversion specifiers: 14533 14534 - "s", "r", "a": any type 14535 - "i", "d", "u": int or float 14536 - "o", "x", "X": int 14537 - "e", "E", "f", "F", "g", "G": float 14538 - "c": int or str (1 character) 14539 14540 When possible, the output is written directly into the Unicode writer 14541 (ctx->writer). A string is created when padding is required. 
Return 0 if the argument has been formatted into *p_str,
          1 if the argument has been written into ctx->writer,
         -1 on error. */
static int
unicode_format_arg_format(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject **p_str)
{
    PyObject *v;
    _PyUnicodeWriter *writer = &ctx->writer;

    /* No format character is left after this one: stop overallocating
       the writer buffer. */
    if (ctx->fmtcnt == 0)
        ctx->writer.overallocate = 0;

    /* "%%" writes a literal '%' and consumes no argument. */
    if (arg->ch == '%') {
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return -1;
        return 1;
    }

    v = unicode_format_getnextarg(ctx);
    if (v == NULL)
        return -1;


    switch (arg->ch) {
    case 's':
    case 'r':
    case 'a':
        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
            /* Fast path */
            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
                return -1;
            return 1;
        }

        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
            /* An exact str is used as is; take a reference for *p_str. */
            *p_str = v;
            Py_INCREF(*p_str);
        }
        else {
            if (arg->ch == 's')
                *p_str = PyObject_Str(v);
            else if (arg->ch == 'r')
                *p_str = PyObject_Repr(v);
            else
                *p_str = PyObject_ASCII(v);
        }
        break;

    case 'i':
    case 'd':
    case 'u':
    case 'o':
    case 'x':
    case 'X':
        {
            /* ret is 1 (already written) or -1 (error): nothing left to
               do here.  ret == 0 leaves the digits in *p_str. */
            int ret = mainformatlong(v, arg, p_str, writer);
            if (ret != 0)
                return ret;
            arg->sign = 1;
            break;
        }

    case 'e':
    case 'E':
    case 'f':
    case 'F':
    case 'g':
    case 'G':
        if (arg->width == -1 && arg->prec == -1
            && !(arg->flags & (F_SIGN | F_BLANK)))
        {
            /* Fast path */
            if (formatfloat(v, arg, NULL, writer) == -1)
                return -1;
            return 1;
        }

        arg->sign = 1;
        if (formatfloat(v, arg, p_str, NULL) == -1)
            return -1;
        break;

    case 'c':
    {
        Py_UCS4 ch = formatchar(v);
        if (ch == (Py_UCS4) -1)
            return -1;
        if (arg->width == -1 && arg->prec == -1) {
            /* Fast path */
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
                return -1;
            return 1;
        }
        *p_str = PyUnicode_FromOrdinal(ch);
        break;
    }

    default:
        PyErr_Format(PyExc_ValueError,
                     "unsupported format character '%c' (0x%x) "
                     "at index %zd",
                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
                     (int)arg->ch,
                     ctx->fmtpos - 1);
        return -1;
    }
    /* Every branch that reaches this point stored its result in *p_str;
       NULL means the conversion failed. */
    if (*p_str == NULL)
        return -1;
    assert (PyUnicode_Check(*p_str));
    return 0;
}

/* Write str into ctx->writer, applying the width, precision, sign and
   fill settings of arg.  arg->width and arg->sign are modified while the
   output is laid out.

   Return 0 on success, -1 if readying str or a writer call fails. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero fill is only used for numeric conversions (arg->sign set). */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign is written separately: skip it in str. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* Reserve one extra character for the sign when the field is exactly
       as wide as the digits. */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}

/* Helper of PyUnicode_Format(): format one arg.
Return 0 on success, raise an exception and return -1 on error. */
static int
unicode_format_arg(struct unicode_formatter_t *ctx)
{
    struct unicode_format_arg_t arg;
    PyObject *str;
    int ret;

    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
    arg.flags = 0;
    arg.width = -1;
    arg.prec = -1;
    arg.sign = 0;
    str = NULL;

    ret = unicode_format_arg_parse(ctx, &arg);
    if (ret == -1)
        return -1;

    ret = unicode_format_arg_format(ctx, &arg, &str);
    if (ret == -1)
        return -1;

    /* ret == 1: the value is already in the writer.  Otherwise str holds
       the formatted value, which still has to be padded and written. */
    if (ret != 1) {
        ret = unicode_format_arg_output(ctx, &arg, str);
        Py_DECREF(str);
        if (ret == -1)
            return -1;
    }

    if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        return -1;
    }
    return 0;
}

/* Build a string from format and args using "%" conversion
   specifications.

   args is consumed as a tuple of values when it is a tuple, and as a
   single value otherwise; a mapping that is neither a tuple nor a str is
   also used for "%(name)" lookups.

   Return the string produced by the writer, or NULL with an exception
   set on error. */
PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{
    struct unicode_formatter_t ctx;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    if (ensure_unicode(format) < 0)
        return NULL;

    ctx.fmtstr = format;
    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
    ctx.fmtpos = 0;

    _PyUnicodeWriter_Init(&ctx.writer);
    ctx.writer.min_length = ctx.fmtcnt + 100;
    ctx.writer.overallocate = 1;

    if (PyTuple_Check(args)) {
        ctx.arglen = PyTuple_Size(args);
        ctx.argidx = 0;
    }
    else {
        /* Single argument: with these values the first request for an
           argument yields args itself and the next one fails. */
        ctx.arglen = -1;
        ctx.argidx = -2;
    }
    ctx.args_owned = 0;
    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
        ctx.dict = args;
    else
        ctx.dict = NULL;
    ctx.args = args;

    while (--ctx.fmtcnt >= 0) {
        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
            /* Copy the run of literal characters up to the next '%' (or
               the end of the format string) in one go. */
            Py_ssize_t nonfmtpos;

            nonfmtpos = ctx.fmtpos++;
            while (ctx.fmtcnt >= 0 &&
                   PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
                ctx.fmtpos++;
                ctx.fmtcnt--;
            }
            if (ctx.fmtcnt < 0) {
                ctx.fmtpos--;
                ctx.writer.overallocate = 0;
            }

            if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
                                                nonfmtpos, ctx.fmtpos) < 0)
                goto onError;
        }
        else {
            ctx.fmtpos++;
            if (unicode_format_arg(&ctx) == -1)
                goto onError;
        }
    }

    if (ctx.argidx < ctx.arglen && !ctx.dict) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return _PyUnicodeWriter_Finish(&ctx.writer);

  onError:
    _PyUnicodeWriter_Dealloc(&ctx.writer);
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return NULL;
}

static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);

/* tp_new of str.  Subtypes are delegated to unicode_subtype_new().
   With no object the empty string is returned; with neither encoding nor
   errors the result is PyObject_Str(object); otherwise the object is
   decoded with PyUnicode_FromEncodedObject(). */
static PyObject *
unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyObject *x = NULL;
    static char *kwlist[] = {"object", "encoding", "errors", 0};
    char *encoding = NULL;
    char *errors = NULL;

    if (type != &PyUnicode_Type)
        return unicode_subtype_new(type, args, kwds);
    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
                                     kwlist, &x, &encoding, &errors))
        return NULL;
    if (x == NULL)
        _Py_RETURN_UNICODE_EMPTY();
    if (encoding == NULL && errors == NULL)
        return PyObject_Str(x);
    else
        return PyUnicode_FromEncodedObject(x, encoding, errors);
}

/* Create an instance of a str subtype: build an exact str from the
   arguments with unicode_new(), allocate the instance with
   type->tp_alloc and copy the characters into a separately allocated
   buffer (the instance is marked as non-compact).

   Return the new instance, or NULL with an exception set. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyObject *unicode, *self;
    Py_ssize_t length, char_size;
    int share_wstr, share_utf8;
    unsigned int kind;
    void *data;

    assert(PyType_IsSubtype(type, &PyUnicode_Type));

    unicode = unicode_new(&PyUnicode_Type, args, kwds);
    if (unicode == NULL)
        return NULL;
    assert(_PyUnicode_CHECK(unicode));
    if (PyUnicode_READY(unicode) == -1) {
        Py_DECREF(unicode);
        return NULL;
    }

    self = type->tp_alloc(type, 0);
    if (self == NULL) {
        Py_DECREF(unicode);
        return NULL;
    }
    kind = PyUnicode_KIND(unicode);
    length = PyUnicode_GET_LENGTH(unicode);

    _PyUnicode_LENGTH(self) = length;
#ifdef Py_DEBUG
    /* In debug builds the hash is only copied at the end, after the
       consistency check below. */
    _PyUnicode_HASH(self) = -1;
#else
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
#endif
    _PyUnicode_STATE(self).interned = 0;
    _PyUnicode_STATE(self).kind = kind;
    _PyUnicode_STATE(self).compact = 0;
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
    _PyUnicode_STATE(self).ready = 1;
    _PyUnicode_WSTR(self) = NULL;
    _PyUnicode_UTF8_LENGTH(self) = 0;
    _PyUnicode_UTF8(self) = NULL;
    _PyUnicode_WSTR_LENGTH(self) = 0;
    _PyUnicode_DATA_ANY(self) = NULL;

    /* Decide whether the utf8 or wstr pointer can alias the data buffer
       instead of needing a buffer of its own. */
    share_utf8 = 0;
    share_wstr = 0;
    if (kind == PyUnicode_1BYTE_KIND) {
        char_size = 1;
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
            share_utf8 = 1;
    }
    else if (kind == PyUnicode_2BYTE_KIND) {
        char_size = 2;
        if (sizeof(wchar_t) == 2)
            share_wstr = 1;
    }
    else {
        assert(kind == PyUnicode_4BYTE_KIND);
        char_size = 4;
        if (sizeof(wchar_t) == 4)
            share_wstr = 1;
    }

    /* Ensure we won't overflow the length. */
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
        PyErr_NoMemory();
        goto onError;
    }
    data = PyObject_MALLOC((length + 1) * char_size);
    if (data == NULL) {
        PyErr_NoMemory();
        goto onError;
    }

    _PyUnicode_DATA_ANY(self) = data;
    if (share_utf8) {
        _PyUnicode_UTF8_LENGTH(self) = length;
        _PyUnicode_UTF8(self) = data;
    }
    if (share_wstr) {
        _PyUnicode_WSTR_LENGTH(self) = length;
        _PyUnicode_WSTR(self) = (wchar_t *)data;
    }

    /* length + 1 characters are copied: the buffer was allocated with
       room for one character past the string's length. */
    memcpy(data, PyUnicode_DATA(unicode),
           kind * (length + 1));
    assert(_PyUnicode_CheckConsistency(self, 1));
#ifdef Py_DEBUG
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
#endif
    Py_DECREF(unicode);
    return self;

onError:
    Py_DECREF(unicode);
    Py_DECREF(self);
    return NULL;
}

PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");

static PyObject *unicode_iter(PyObject *seq);

PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};

/* Initialize the Unicode implementation */

int _PyUnicode_Init(void)
{
    /* XXX - move this array to unicodectype.c ? */
    Py_UCS2 linebreak[] = {
        0x000A, /* LINE FEED */
        0x000D, /* CARRIAGE RETURN */
        0x001C, /* FILE SEPARATOR */
        0x001D, /* GROUP SEPARATOR */
        0x001E, /* RECORD SEPARATOR */
        0x0085, /* NEXT LINE */
        0x2028, /* LINE SEPARATOR */
        0x2029, /* PARAGRAPH SEPARATOR */
    };

    /* Init the implementation */
    _Py_INCREF_UNICODE_EMPTY();
    if (!unicode_empty)
        Py_FatalError("Can't create empty string");
    Py_DECREF(unicode_empty);

    if (PyType_Ready(&PyUnicode_Type) < 0)
        Py_FatalError("Can't initialize 'unicode'");

    /* initialize the linebreak bloom filter */
    bloom_linebreak = make_bloom_mask(
        PyUnicode_2BYTE_KIND, linebreak,
        Py_ARRAY_LENGTH(linebreak));

    if (PyType_Ready(&EncodingMapType) < 0)
         Py_FatalError("Can't initialize encoding map type");

    if (PyType_Ready(&PyFieldNameIter_Type) < 0)
        Py_FatalError("Can't initialize field name iterator type");

    if (PyType_Ready(&PyFormatterIter_Type) < 0)
        Py_FatalError("Can't initialize formatter iter type");

    return 0;
}

/* Finalize the Unicode implementation */

/* Nothing is freed here: the function always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}

/* Release the cached empty string, the cached Latin-1 single-character
   strings and the static strings. */
void
_PyUnicode_Fini(void)
{
    int i;

    Py_CLEAR(unicode_empty);

    for (i = 0; i < 256; i++)
        Py_CLEAR(unicode_latin1[i]);
    _PyUnicode_ClearStaticStrings();
    (void)PyUnicode_ClearFreeList();
}

/* Intern the string *p in place: if an equal string is already in the
   interned dict, *p is replaced by a new reference to it; otherwise *p is
   added to the dict and marked as interned (mortal).

   Nothing is done for NULL or non-str values (release builds), for
   instances of str subclasses and for strings that are already interned.
   Errors are cleared, never reported. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    PyObject *s = *p;
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    Py_ALLOW_RECURSION
    t = PyDict_SetDefault(interned, s, s);
    Py_END_ALLOW_RECURSION
    if (t == NULL) {
        PyErr_Clear();
        return;
    }
    if (t != s) {
        /* An equal string was already interned: hand that one out. */
        Py_INCREF(t);
        Py_SETREF(*p, t);
        return;
    }
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}

/* Intern *p with PyUnicode_InternInPlace(), then mark the resulting
   string as immortal and take an extra reference to it, unless it is
   already marked immortal. */
void
PyUnicode_InternImmortal(PyObject **p)
{
    PyUnicode_InternInPlace(p);
    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
        Py_INCREF(*p);
    }
}

/* Create a str from the C string cp with PyUnicode_FromString() and
   intern it.  Return NULL if the string cannot be created. */
PyObject *
PyUnicode_InternFromString(const char *cp)
{
    PyObject *s = PyUnicode_FromString(cp);
    if (s == NULL)
        return NULL;
    PyUnicode_InternInPlace(&s);
    return s;
}

void
_Py_ReleaseInternedUnicodeStrings(void)
{
    PyObject *keys;
    PyObject *s;
    Py_ssize_t i, n;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned))
        return;
    keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    n = PyList_GET_SIZE(keys);
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);
    for (i = 0; i < n; i++) {
        s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            assert(0 && "could not ready string");
            fprintf(stderr, "could not ready string\n");
        }
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        case SSTATE_INTERNED_IMMORTAL:
            Py_REFCNT(s) += 1;
            immortal_size += PyUnicode_GET_LENGTH(s);
            break;
        case SSTATE_INTERNED_MORTAL:
            Py_REFCNT(s) += 2;
            mortal_size += PyUnicode_GET_LENGTH(s);
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}


/********************* Unicode Iterator **************************/

typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;

static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}

static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}

/* Return the next character as a new one-character string, or NULL when
   the iterator is exhausted (no exception is set in that case here) or
   when creating the string fails. */
static PyObject *
unicodeiter_next(unicodeiterobject *it)
{
    PyObject *seq, *item;

    assert(it != NULL);
    seq = it->it_seq;
    if (seq == NULL)
        return NULL;
    assert(_PyUnicode_CHECK(seq));

    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
        int kind = PyUnicode_KIND(seq);
        void *data = PyUnicode_DATA(seq);
        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
        item = PyUnicode_FromOrdinal(chr);
        /* Only advance when the item could be created. */
        if (item != NULL)
            ++it->it_index;
        return item;
    }

    /* Exhausted: drop the reference to the string. */
    it->it_seq = NULL;
    Py_DECREF(seq);
    return NULL;
}

/* Return the number of characters left to iterate over, 0 once the
   iterator is exhausted. */
static PyObject *
unicodeiter_len(unicodeiterobject *it)
{
    Py_ssize_t len = 0;
    if (it->it_seq)
        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
    return PyLong_FromSsize_t(len);
}

PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Return (iter, (string,), index) for a live iterator, or
   (iter, (empty string,)) once it is exhausted. */
static PyObject *
unicodeiter_reduce(unicodeiterobject *it)
{
    if (it->it_seq != NULL) {
        return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
                             it->it_seq, it->it_index);
    } else {
        PyObject *u = PyUnicode_FromUnicode(NULL, 0);
        if (u == NULL)
            return NULL;
        return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
    }
}

PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");

/* Set the current index from state, clamped to [0, len(string)].
   The state is ignored when the iterator is already exhausted. */
static PyObject *
unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
{
    Py_ssize_t index = PyLong_AsSsize_t(state);
    if (index == -1 && PyErr_Occurred())
        return NULL;
    if (it->it_seq != NULL) {
        if (index < 0)
            index = 0;
        else if (index > PyUnicode_GET_LENGTH(it->it_seq))
            index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
        it->it_index = index;
    }
    Py_RETURN_NONE;
}

PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");

static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
{"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O, 15398 setstate_doc}, 15399 {NULL, NULL} /* sentinel */ 15400 }; 15401 15402 PyTypeObject PyUnicodeIter_Type = { 15403 PyVarObject_HEAD_INIT(&PyType_Type, 0) 15404 "str_iterator", /* tp_name */ 15405 sizeof(unicodeiterobject), /* tp_basicsize */ 15406 0, /* tp_itemsize */ 15407 /* methods */ 15408 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 15409 0, /* tp_print */ 15410 0, /* tp_getattr */ 15411 0, /* tp_setattr */ 15412 0, /* tp_reserved */ 15413 0, /* tp_repr */ 15414 0, /* tp_as_number */ 15415 0, /* tp_as_sequence */ 15416 0, /* tp_as_mapping */ 15417 0, /* tp_hash */ 15418 0, /* tp_call */ 15419 0, /* tp_str */ 15420 PyObject_GenericGetAttr, /* tp_getattro */ 15421 0, /* tp_setattro */ 15422 0, /* tp_as_buffer */ 15423 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 15424 0, /* tp_doc */ 15425 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 15426 0, /* tp_clear */ 15427 0, /* tp_richcompare */ 15428 0, /* tp_weaklistoffset */ 15429 PyObject_SelfIter, /* tp_iter */ 15430 (iternextfunc)unicodeiter_next, /* tp_iternext */ 15431 unicodeiter_methods, /* tp_methods */ 15432 0, 15433 }; 15434 15435 static PyObject * 15436 unicode_iter(PyObject *seq) 15437 { 15438 unicodeiterobject *it; 15439 15440 if (!PyUnicode_Check(seq)) { 15441 PyErr_BadInternalCall(); 15442 return NULL; 15443 } 15444 if (PyUnicode_READY(seq) == -1) 15445 return NULL; 15446 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 15447 if (it == NULL) 15448 return NULL; 15449 it->it_index = 0; 15450 Py_INCREF(seq); 15451 it->it_seq = seq; 15452 _PyObject_GC_TRACK(it); 15453 return (PyObject *)it; 15454 } 15455 15456 15457 size_t 15458 Py_UNICODE_strlen(const Py_UNICODE *u) 15459 { 15460 int res = 0; 15461 while(*u++) 15462 res++; 15463 return res; 15464 } 15465 15466 Py_UNICODE* 15467 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 15468 { 15469 Py_UNICODE *u = s1; 15470 while ((*u++ = *s2++)); 15471 return 
s1; 15472 } 15473 15474 Py_UNICODE* 15475 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15476 { 15477 Py_UNICODE *u = s1; 15478 while ((*u++ = *s2++)) 15479 if (n-- == 0) 15480 break; 15481 return s1; 15482 } 15483 15484 Py_UNICODE* 15485 Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 15486 { 15487 Py_UNICODE *u1 = s1; 15488 u1 += Py_UNICODE_strlen(u1); 15489 Py_UNICODE_strcpy(u1, s2); 15490 return s1; 15491 } 15492 15493 int 15494 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 15495 { 15496 while (*s1 && *s2 && *s1 == *s2) 15497 s1++, s2++; 15498 if (*s1 && *s2) 15499 return (*s1 < *s2) ? -1 : +1; 15500 if (*s1) 15501 return 1; 15502 if (*s2) 15503 return -1; 15504 return 0; 15505 } 15506 15507 int 15508 Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15509 { 15510 Py_UNICODE u1, u2; 15511 for (; n != 0; n--) { 15512 u1 = *s1; 15513 u2 = *s2; 15514 if (u1 != u2) 15515 return (u1 < u2) ? -1 : +1; 15516 if (u1 == '\0') 15517 return 0; 15518 s1++; 15519 s2++; 15520 } 15521 return 0; 15522 } 15523 15524 Py_UNICODE* 15525 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 15526 { 15527 const Py_UNICODE *p; 15528 for (p = s; *p; p++) 15529 if (*p == c) 15530 return (Py_UNICODE*)p; 15531 return NULL; 15532 } 15533 15534 Py_UNICODE* 15535 Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 15536 { 15537 const Py_UNICODE *p; 15538 p = s + Py_UNICODE_strlen(s); 15539 while (p != s) { 15540 p--; 15541 if (*p == c) 15542 return (Py_UNICODE*)p; 15543 } 15544 return NULL; 15545 } 15546 15547 Py_UNICODE* 15548 PyUnicode_AsUnicodeCopy(PyObject *unicode) 15549 { 15550 Py_UNICODE *u, *copy; 15551 Py_ssize_t len, size; 15552 15553 if (!PyUnicode_Check(unicode)) { 15554 PyErr_BadArgument(); 15555 return NULL; 15556 } 15557 u = PyUnicode_AsUnicodeAndSize(unicode, &len); 15558 if (u == NULL) 15559 return NULL; 15560 /* Ensure we won't overflow the size. 
*/ 15561 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { 15562 PyErr_NoMemory(); 15563 return NULL; 15564 } 15565 size = len + 1; /* copy the null character */ 15566 size *= sizeof(Py_UNICODE); 15567 copy = PyMem_Malloc(size); 15568 if (copy == NULL) { 15569 PyErr_NoMemory(); 15570 return NULL; 15571 } 15572 memcpy(copy, u, size); 15573 return copy; 15574 } 15575 15576 /* A _string module, to export formatter_parser and formatter_field_name_split 15577 to the string.Formatter class implemented in Python. */ 15578 15579 static PyMethodDef _string_methods[] = { 15580 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 15581 METH_O, PyDoc_STR("split the argument as a field name")}, 15582 {"formatter_parser", (PyCFunction) formatter_parser, 15583 METH_O, PyDoc_STR("parse the argument as a format string")}, 15584 {NULL, NULL} 15585 }; 15586 15587 static struct PyModuleDef _string_module = { 15588 PyModuleDef_HEAD_INIT, 15589 "_string", 15590 PyDoc_STR("string helper module"), 15591 0, 15592 _string_methods, 15593 NULL, 15594 NULL, 15595 NULL, 15596 NULL 15597 }; 15598 15599 PyMODINIT_FUNC 15600 PyInit__string(void) 15601 { 15602 return PyModule_Create(&_string_module); 15603 } 15604 15605 15606 #ifdef __cplusplus 15607 } 15608 #endif 15609