1 /* 2 3 Unicode implementation based on original code by Fredrik Lundh, 4 modified by Marc-Andre Lemburg <mal (at) lemburg.com>. 5 6 Major speed upgrades to the method implementations at the Reykjavik 7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9 Copyright (c) Corporation for National Research Initiatives. 10 11 -------------------------------------------------------------------- 12 The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17 By obtaining, using, and/or copying this software and/or its 18 associated documentation, you agree that you have read, understood, 19 and will comply with the following terms and conditions: 20 21 Permission to use, copy, modify, and distribute this software and its 22 associated documentation for any purpose and without fee is hereby 23 granted, provided that the above copyright notice appears in all 24 copies, and that both that copyright notice and this permission notice 25 appear in supporting documentation, and that the name of Secret Labs 26 AB or the author not be used in advertising or publicity pertaining to 27 distribution of the software without specific, written prior 28 permission. 29 30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37 -------------------------------------------------------------------- 38 39 */ 40 41 #define PY_SSIZE_T_CLEAN 42 #include "Python.h" 43 #include "internal/pystate.h" 44 #include "ucnhash.h" 45 #include "bytes_methods.h" 46 #include "stringlib/eq.h" 47 48 #ifdef MS_WINDOWS 49 #include <windows.h> 50 #endif 51 52 /*[clinic input] 53 class str "PyObject *" "&PyUnicode_Type" 54 [clinic start generated code]*/ 55 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/ 56 57 /*[python input] 58 class Py_UCS4_converter(CConverter): 59 type = 'Py_UCS4' 60 converter = 'convert_uc' 61 62 def converter_init(self): 63 if self.default is not unspecified: 64 self.c_default = ascii(self.default) 65 if len(self.c_default) > 4 or self.c_default[0] != "'": 66 self.c_default = hex(ord(self.default)) 67 68 [python start generated code]*/ 69 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/ 70 71 /* --- Globals ------------------------------------------------------------ 72 73 NOTE: In the interpreter's initialization phase, some globals are currently 74 initialized dynamically as needed. In the process Unicode objects may 75 be created before the Unicode type is ready. 76 77 */ 78 79 80 #ifdef __cplusplus 81 extern "C" { 82 #endif 83 84 /* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 85 #define MAX_UNICODE 0x10ffff 86 87 #ifdef Py_DEBUG 88 # define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 89 #else 90 # define _PyUnicode_CHECK(op) PyUnicode_Check(op) 91 #endif 92 93 #define _PyUnicode_UTF8(op) \ 94 (((PyCompactUnicodeObject*)(op))->utf8) 95 #define PyUnicode_UTF8(op) \ 96 (assert(_PyUnicode_CHECK(op)), \ 97 assert(PyUnicode_IS_READY(op)), \ 98 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 99 ((char*)((PyASCIIObject*)(op) + 1)) : \ 100 _PyUnicode_UTF8(op)) 101 #define _PyUnicode_UTF8_LENGTH(op) \ 102 (((PyCompactUnicodeObject*)(op))->utf8_length) 103 #define PyUnicode_UTF8_LENGTH(op) \ 104 (assert(_PyUnicode_CHECK(op)), \ 105 assert(PyUnicode_IS_READY(op)), \ 106 PyUnicode_IS_COMPACT_ASCII(op) ? \ 107 ((PyASCIIObject*)(op))->length : \ 108 _PyUnicode_UTF8_LENGTH(op)) 109 #define _PyUnicode_WSTR(op) \ 110 (((PyASCIIObject*)(op))->wstr) 111 #define _PyUnicode_WSTR_LENGTH(op) \ 112 (((PyCompactUnicodeObject*)(op))->wstr_length) 113 #define _PyUnicode_LENGTH(op) \ 114 (((PyASCIIObject *)(op))->length) 115 #define _PyUnicode_STATE(op) \ 116 (((PyASCIIObject *)(op))->state) 117 #define _PyUnicode_HASH(op) \ 118 (((PyASCIIObject *)(op))->hash) 119 #define _PyUnicode_KIND(op) \ 120 (assert(_PyUnicode_CHECK(op)), \ 121 ((PyASCIIObject *)(op))->state.kind) 122 #define _PyUnicode_GET_LENGTH(op) \ 123 (assert(_PyUnicode_CHECK(op)), \ 124 ((PyASCIIObject *)(op))->length) 125 #define _PyUnicode_DATA_ANY(op) \ 126 (((PyUnicodeObject*)(op))->data.any) 127 128 #undef PyUnicode_READY 129 #define PyUnicode_READY(op) \ 130 (assert(_PyUnicode_CHECK(op)), \ 131 (PyUnicode_IS_READY(op) ? 
\ 132 0 : \ 133 _PyUnicode_Ready(op))) 134 135 #define _PyUnicode_SHARE_UTF8(op) \ 136 (assert(_PyUnicode_CHECK(op)), \ 137 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 138 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 139 #define _PyUnicode_SHARE_WSTR(op) \ 140 (assert(_PyUnicode_CHECK(op)), \ 141 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 142 143 /* true if the Unicode object has an allocated UTF-8 memory block 144 (not shared with other data) */ 145 #define _PyUnicode_HAS_UTF8_MEMORY(op) \ 146 ((!PyUnicode_IS_COMPACT_ASCII(op) \ 147 && _PyUnicode_UTF8(op) \ 148 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 149 150 /* true if the Unicode object has an allocated wstr memory block 151 (not shared with other data) */ 152 #define _PyUnicode_HAS_WSTR_MEMORY(op) \ 153 ((_PyUnicode_WSTR(op) && \ 154 (!PyUnicode_IS_READY(op) || \ 155 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 156 157 /* Generic helper macro to convert characters of different types. 158 from_type and to_type have to be valid type names, begin and end 159 are pointers to the source characters which should be of type 160 "from_type *". to is a pointer of type "to_type *" and points to the 161 buffer where the result characters are written to. 
*/ 162 #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 163 do { \ 164 to_type *_to = (to_type *)(to); \ 165 const from_type *_iter = (from_type *)(begin); \ 166 const from_type *_end = (from_type *)(end); \ 167 Py_ssize_t n = (_end) - (_iter); \ 168 const from_type *_unrolled_end = \ 169 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \ 170 while (_iter < (_unrolled_end)) { \ 171 _to[0] = (to_type) _iter[0]; \ 172 _to[1] = (to_type) _iter[1]; \ 173 _to[2] = (to_type) _iter[2]; \ 174 _to[3] = (to_type) _iter[3]; \ 175 _iter += 4; _to += 4; \ 176 } \ 177 while (_iter < (_end)) \ 178 *_to++ = (to_type) *_iter++; \ 179 } while (0) 180 181 #ifdef MS_WINDOWS 182 /* On Windows, overallocate by 50% is the best factor */ 183 # define OVERALLOCATE_FACTOR 2 184 #else 185 /* On Linux, overallocate by 25% is the best factor */ 186 # define OVERALLOCATE_FACTOR 4 187 #endif 188 189 /* This dictionary holds all interned unicode strings. Note that references 190 to strings in this dictionary are *not* counted in the string's ob_refcnt. 191 When the interned string reaches a refcnt of 0 the string deallocation 192 function will delete the reference from this dictionary. 193 194 Another way to look at this is that to say that the actual reference 195 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 196 */ 197 static PyObject *interned = NULL; 198 199 /* The empty Unicode object is shared to improve performance. 
*/ 200 static PyObject *unicode_empty = NULL; 201 202 #define _Py_INCREF_UNICODE_EMPTY() \ 203 do { \ 204 if (unicode_empty != NULL) \ 205 Py_INCREF(unicode_empty); \ 206 else { \ 207 unicode_empty = PyUnicode_New(0, 0); \ 208 if (unicode_empty != NULL) { \ 209 Py_INCREF(unicode_empty); \ 210 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \ 211 } \ 212 } \ 213 } while (0) 214 215 #define _Py_RETURN_UNICODE_EMPTY() \ 216 do { \ 217 _Py_INCREF_UNICODE_EMPTY(); \ 218 return unicode_empty; \ 219 } while (0) 220 221 #define FILL(kind, data, value, start, length) \ 222 do { \ 223 assert(0 <= start); \ 224 assert(kind != PyUnicode_WCHAR_KIND); \ 225 switch (kind) { \ 226 case PyUnicode_1BYTE_KIND: { \ 227 assert(value <= 0xff); \ 228 Py_UCS1 ch = (unsigned char)value; \ 229 Py_UCS1 *to = (Py_UCS1 *)data + start; \ 230 memset(to, ch, length); \ 231 break; \ 232 } \ 233 case PyUnicode_2BYTE_KIND: { \ 234 assert(value <= 0xffff); \ 235 Py_UCS2 ch = (Py_UCS2)value; \ 236 Py_UCS2 *to = (Py_UCS2 *)data + start; \ 237 const Py_UCS2 *end = to + length; \ 238 for (; to < end; ++to) *to = ch; \ 239 break; \ 240 } \ 241 case PyUnicode_4BYTE_KIND: { \ 242 assert(value <= MAX_UNICODE); \ 243 Py_UCS4 ch = value; \ 244 Py_UCS4 * to = (Py_UCS4 *)data + start; \ 245 const Py_UCS4 *end = to + length; \ 246 for (; to < end; ++to) *to = ch; \ 247 break; \ 248 } \ 249 default: Py_UNREACHABLE(); \ 250 } \ 251 } while (0) 252 253 254 /* Forward declaration */ 255 static inline int 256 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); 257 258 /* List of static strings. */ 259 static _Py_Identifier *static_strings = NULL; 260 261 /* Single character Unicode strings in the Latin-1 range are being 262 shared as well. 
*/ 263 static PyObject *unicode_latin1[256] = {NULL}; 264 265 /* Fast detection of the most frequent whitespace characters */ 266 const unsigned char _Py_ascii_whitespace[] = { 267 0, 0, 0, 0, 0, 0, 0, 0, 268 /* case 0x0009: * CHARACTER TABULATION */ 269 /* case 0x000A: * LINE FEED */ 270 /* case 0x000B: * LINE TABULATION */ 271 /* case 0x000C: * FORM FEED */ 272 /* case 0x000D: * CARRIAGE RETURN */ 273 0, 1, 1, 1, 1, 1, 0, 0, 274 0, 0, 0, 0, 0, 0, 0, 0, 275 /* case 0x001C: * FILE SEPARATOR */ 276 /* case 0x001D: * GROUP SEPARATOR */ 277 /* case 0x001E: * RECORD SEPARATOR */ 278 /* case 0x001F: * UNIT SEPARATOR */ 279 0, 0, 0, 0, 1, 1, 1, 1, 280 /* case 0x0020: * SPACE */ 281 1, 0, 0, 0, 0, 0, 0, 0, 282 0, 0, 0, 0, 0, 0, 0, 0, 283 0, 0, 0, 0, 0, 0, 0, 0, 284 0, 0, 0, 0, 0, 0, 0, 0, 285 286 0, 0, 0, 0, 0, 0, 0, 0, 287 0, 0, 0, 0, 0, 0, 0, 0, 288 0, 0, 0, 0, 0, 0, 0, 0, 289 0, 0, 0, 0, 0, 0, 0, 0, 290 0, 0, 0, 0, 0, 0, 0, 0, 291 0, 0, 0, 0, 0, 0, 0, 0, 292 0, 0, 0, 0, 0, 0, 0, 0, 293 0, 0, 0, 0, 0, 0, 0, 0 294 }; 295 296 /* forward */ 297 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 298 static PyObject* get_latin1_char(unsigned char ch); 299 static int unicode_modifiable(PyObject *unicode); 300 301 302 static PyObject * 303 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size); 304 static PyObject * 305 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 306 static PyObject * 307 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 308 309 static PyObject * 310 unicode_encode_call_errorhandler(const char *errors, 311 PyObject **errorHandler,const char *encoding, const char *reason, 312 PyObject *unicode, PyObject **exceptionObject, 313 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 314 315 static void 316 raise_encode_exception(PyObject **exceptionObject, 317 const char *encoding, 318 PyObject *unicode, 319 Py_ssize_t startpos, Py_ssize_t endpos, 320 const char *reason); 321 322 /* Same for linebreaks */ 323 static const unsigned 
char ascii_linebreak[] = { 324 0, 0, 0, 0, 0, 0, 0, 0, 325 /* 0x000A, * LINE FEED */ 326 /* 0x000B, * LINE TABULATION */ 327 /* 0x000C, * FORM FEED */ 328 /* 0x000D, * CARRIAGE RETURN */ 329 0, 0, 1, 1, 1, 1, 0, 0, 330 0, 0, 0, 0, 0, 0, 0, 0, 331 /* 0x001C, * FILE SEPARATOR */ 332 /* 0x001D, * GROUP SEPARATOR */ 333 /* 0x001E, * RECORD SEPARATOR */ 334 0, 0, 0, 0, 1, 1, 1, 0, 335 0, 0, 0, 0, 0, 0, 0, 0, 336 0, 0, 0, 0, 0, 0, 0, 0, 337 0, 0, 0, 0, 0, 0, 0, 0, 338 0, 0, 0, 0, 0, 0, 0, 0, 339 340 0, 0, 0, 0, 0, 0, 0, 0, 341 0, 0, 0, 0, 0, 0, 0, 0, 342 0, 0, 0, 0, 0, 0, 0, 0, 343 0, 0, 0, 0, 0, 0, 0, 0, 344 0, 0, 0, 0, 0, 0, 0, 0, 345 0, 0, 0, 0, 0, 0, 0, 0, 346 0, 0, 0, 0, 0, 0, 0, 0, 347 0, 0, 0, 0, 0, 0, 0, 0 348 }; 349 350 static int convert_uc(PyObject *obj, void *addr); 351 352 #include "clinic/unicodeobject.c.h" 353 354 typedef enum { 355 _Py_ERROR_UNKNOWN=0, 356 _Py_ERROR_STRICT, 357 _Py_ERROR_SURROGATEESCAPE, 358 _Py_ERROR_REPLACE, 359 _Py_ERROR_IGNORE, 360 _Py_ERROR_BACKSLASHREPLACE, 361 _Py_ERROR_SURROGATEPASS, 362 _Py_ERROR_XMLCHARREFREPLACE, 363 _Py_ERROR_OTHER 364 } _Py_error_handler; 365 366 static _Py_error_handler 367 get_error_handler(const char *errors) 368 { 369 if (errors == NULL || strcmp(errors, "strict") == 0) { 370 return _Py_ERROR_STRICT; 371 } 372 if (strcmp(errors, "surrogateescape") == 0) { 373 return _Py_ERROR_SURROGATEESCAPE; 374 } 375 if (strcmp(errors, "replace") == 0) { 376 return _Py_ERROR_REPLACE; 377 } 378 if (strcmp(errors, "ignore") == 0) { 379 return _Py_ERROR_IGNORE; 380 } 381 if (strcmp(errors, "backslashreplace") == 0) { 382 return _Py_ERROR_BACKSLASHREPLACE; 383 } 384 if (strcmp(errors, "surrogatepass") == 0) { 385 return _Py_ERROR_SURROGATEPASS; 386 } 387 if (strcmp(errors, "xmlcharrefreplace") == 0) { 388 return _Py_ERROR_XMLCHARREFREPLACE; 389 } 390 return _Py_ERROR_OTHER; 391 } 392 393 /* The max unicode value is always 0x10FFFF while using the PEP-393 API. 
394 This function is kept for backward compatibility with the old API. */ 395 Py_UNICODE 396 PyUnicode_GetMax(void) 397 { 398 #ifdef Py_UNICODE_WIDE 399 return 0x10FFFF; 400 #else 401 /* This is actually an illegal character, so it should 402 not be passed to unichr. */ 403 return 0xFFFF; 404 #endif 405 } 406 407 #ifdef Py_DEBUG 408 int 409 _PyUnicode_CheckConsistency(PyObject *op, int check_content) 410 { 411 PyASCIIObject *ascii; 412 unsigned int kind; 413 414 assert(PyUnicode_Check(op)); 415 416 ascii = (PyASCIIObject *)op; 417 kind = ascii->state.kind; 418 419 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 420 assert(kind == PyUnicode_1BYTE_KIND); 421 assert(ascii->state.ready == 1); 422 } 423 else { 424 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 425 void *data; 426 427 if (ascii->state.compact == 1) { 428 data = compact + 1; 429 assert(kind == PyUnicode_1BYTE_KIND 430 || kind == PyUnicode_2BYTE_KIND 431 || kind == PyUnicode_4BYTE_KIND); 432 assert(ascii->state.ascii == 0); 433 assert(ascii->state.ready == 1); 434 assert (compact->utf8 != data); 435 } 436 else { 437 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 438 439 data = unicode->data.any; 440 if (kind == PyUnicode_WCHAR_KIND) { 441 assert(ascii->length == 0); 442 assert(ascii->hash == -1); 443 assert(ascii->state.compact == 0); 444 assert(ascii->state.ascii == 0); 445 assert(ascii->state.ready == 0); 446 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 447 assert(ascii->wstr != NULL); 448 assert(data == NULL); 449 assert(compact->utf8 == NULL); 450 } 451 else { 452 assert(kind == PyUnicode_1BYTE_KIND 453 || kind == PyUnicode_2BYTE_KIND 454 || kind == PyUnicode_4BYTE_KIND); 455 assert(ascii->state.compact == 0); 456 assert(ascii->state.ready == 1); 457 assert(data != NULL); 458 if (ascii->state.ascii) { 459 assert (compact->utf8 == data); 460 assert (compact->utf8_length == ascii->length); 461 } 462 else 463 assert (compact->utf8 != data); 464 } 465 } 466 if (kind 
!= PyUnicode_WCHAR_KIND) { 467 if ( 468 #if SIZEOF_WCHAR_T == 2 469 kind == PyUnicode_2BYTE_KIND 470 #else 471 kind == PyUnicode_4BYTE_KIND 472 #endif 473 ) 474 { 475 assert(ascii->wstr == data); 476 assert(compact->wstr_length == ascii->length); 477 } else 478 assert(ascii->wstr != data); 479 } 480 481 if (compact->utf8 == NULL) 482 assert(compact->utf8_length == 0); 483 if (ascii->wstr == NULL) 484 assert(compact->wstr_length == 0); 485 } 486 /* check that the best kind is used */ 487 if (check_content && kind != PyUnicode_WCHAR_KIND) 488 { 489 Py_ssize_t i; 490 Py_UCS4 maxchar = 0; 491 void *data; 492 Py_UCS4 ch; 493 494 data = PyUnicode_DATA(ascii); 495 for (i=0; i < ascii->length; i++) 496 { 497 ch = PyUnicode_READ(kind, data, i); 498 if (ch > maxchar) 499 maxchar = ch; 500 } 501 if (kind == PyUnicode_1BYTE_KIND) { 502 if (ascii->state.ascii == 0) { 503 assert(maxchar >= 128); 504 assert(maxchar <= 255); 505 } 506 else 507 assert(maxchar < 128); 508 } 509 else if (kind == PyUnicode_2BYTE_KIND) { 510 assert(maxchar >= 0x100); 511 assert(maxchar <= 0xFFFF); 512 } 513 else { 514 assert(maxchar >= 0x10000); 515 assert(maxchar <= MAX_UNICODE); 516 } 517 assert(PyUnicode_READ(kind, data, ascii->length) == 0); 518 } 519 return 1; 520 } 521 #endif 522 523 static PyObject* 524 unicode_result_wchar(PyObject *unicode) 525 { 526 #ifndef Py_DEBUG 527 Py_ssize_t len; 528 529 len = _PyUnicode_WSTR_LENGTH(unicode); 530 if (len == 0) { 531 Py_DECREF(unicode); 532 _Py_RETURN_UNICODE_EMPTY(); 533 } 534 535 if (len == 1) { 536 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 537 if ((Py_UCS4)ch < 256) { 538 PyObject *latin1_char = get_latin1_char((unsigned char)ch); 539 Py_DECREF(unicode); 540 return latin1_char; 541 } 542 } 543 544 if (_PyUnicode_Ready(unicode) < 0) { 545 Py_DECREF(unicode); 546 return NULL; 547 } 548 #else 549 assert(Py_REFCNT(unicode) == 1); 550 551 /* don't make the result ready in debug mode to ensure that the caller 552 makes the string ready before using it */ 
553 assert(_PyUnicode_CheckConsistency(unicode, 1)); 554 #endif 555 return unicode; 556 } 557 558 static PyObject* 559 unicode_result_ready(PyObject *unicode) 560 { 561 Py_ssize_t length; 562 563 length = PyUnicode_GET_LENGTH(unicode); 564 if (length == 0) { 565 if (unicode != unicode_empty) { 566 Py_DECREF(unicode); 567 _Py_RETURN_UNICODE_EMPTY(); 568 } 569 return unicode_empty; 570 } 571 572 if (length == 1) { 573 void *data = PyUnicode_DATA(unicode); 574 int kind = PyUnicode_KIND(unicode); 575 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 576 if (ch < 256) { 577 PyObject *latin1_char = unicode_latin1[ch]; 578 if (latin1_char != NULL) { 579 if (unicode != latin1_char) { 580 Py_INCREF(latin1_char); 581 Py_DECREF(unicode); 582 } 583 return latin1_char; 584 } 585 else { 586 assert(_PyUnicode_CheckConsistency(unicode, 1)); 587 Py_INCREF(unicode); 588 unicode_latin1[ch] = unicode; 589 return unicode; 590 } 591 } 592 } 593 594 assert(_PyUnicode_CheckConsistency(unicode, 1)); 595 return unicode; 596 } 597 598 static PyObject* 599 unicode_result(PyObject *unicode) 600 { 601 assert(_PyUnicode_CHECK(unicode)); 602 if (PyUnicode_IS_READY(unicode)) 603 return unicode_result_ready(unicode); 604 else 605 return unicode_result_wchar(unicode); 606 } 607 608 static PyObject* 609 unicode_result_unchanged(PyObject *unicode) 610 { 611 if (PyUnicode_CheckExact(unicode)) { 612 if (PyUnicode_READY(unicode) == -1) 613 return NULL; 614 Py_INCREF(unicode); 615 return unicode; 616 } 617 else 618 /* Subtype -- return genuine unicode string with the same value. */ 619 return _PyUnicode_Copy(unicode); 620 } 621 622 /* Implementation of the "backslashreplace" error handler for 8-bit encodings: 623 ASCII, Latin1, UTF-8, etc. 
*/ 624 static char* 625 backslashreplace(_PyBytesWriter *writer, char *str, 626 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) 627 { 628 Py_ssize_t size, i; 629 Py_UCS4 ch; 630 enum PyUnicode_Kind kind; 631 void *data; 632 633 assert(PyUnicode_IS_READY(unicode)); 634 kind = PyUnicode_KIND(unicode); 635 data = PyUnicode_DATA(unicode); 636 637 size = 0; 638 /* determine replacement size */ 639 for (i = collstart; i < collend; ++i) { 640 Py_ssize_t incr; 641 642 ch = PyUnicode_READ(kind, data, i); 643 if (ch < 0x100) 644 incr = 2+2; 645 else if (ch < 0x10000) 646 incr = 2+4; 647 else { 648 assert(ch <= MAX_UNICODE); 649 incr = 2+8; 650 } 651 if (size > PY_SSIZE_T_MAX - incr) { 652 PyErr_SetString(PyExc_OverflowError, 653 "encoded result is too long for a Python string"); 654 return NULL; 655 } 656 size += incr; 657 } 658 659 str = _PyBytesWriter_Prepare(writer, str, size); 660 if (str == NULL) 661 return NULL; 662 663 /* generate replacement */ 664 for (i = collstart; i < collend; ++i) { 665 ch = PyUnicode_READ(kind, data, i); 666 *str++ = '\\'; 667 if (ch >= 0x00010000) { 668 *str++ = 'U'; 669 *str++ = Py_hexdigits[(ch>>28)&0xf]; 670 *str++ = Py_hexdigits[(ch>>24)&0xf]; 671 *str++ = Py_hexdigits[(ch>>20)&0xf]; 672 *str++ = Py_hexdigits[(ch>>16)&0xf]; 673 *str++ = Py_hexdigits[(ch>>12)&0xf]; 674 *str++ = Py_hexdigits[(ch>>8)&0xf]; 675 } 676 else if (ch >= 0x100) { 677 *str++ = 'u'; 678 *str++ = Py_hexdigits[(ch>>12)&0xf]; 679 *str++ = Py_hexdigits[(ch>>8)&0xf]; 680 } 681 else 682 *str++ = 'x'; 683 *str++ = Py_hexdigits[(ch>>4)&0xf]; 684 *str++ = Py_hexdigits[ch&0xf]; 685 } 686 return str; 687 } 688 689 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings: 690 ASCII, Latin1, UTF-8, etc. 
*/
/* Write a decimal XML character reference ("&#NNN;") for every character
   of unicode[collstart:collend] at `str`.

   The room needed is requested from `writer` before anything is written.
   Returns the position following the last byte written, or NULL if the
   total size would overflow Py_ssize_t (OverflowError is set) or if
   _PyBytesWriter_Prepare() fails.

   The digits are produced by hand rather than with sprintf(): sprintf()
   also stores a terminating null byte, i.e. one byte more than the size
   reserved below, and "%d" does not match the unsigned Py_UCS4 argument. */
static char*
xmlcharrefreplace(_PyBytesWriter *writer, char *str,
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
{
    Py_ssize_t size, i;
    Py_UCS4 ch;
    enum PyUnicode_Kind kind;
    void *data;

    assert(PyUnicode_IS_READY(unicode));
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);

    size = 0;
    /* determine replacement size: "&#" + decimal digits + ";" */
    for (i = collstart; i < collend; ++i) {
        Py_ssize_t incr;

        ch = PyUnicode_READ(kind, data, i);
        if (ch < 10)
            incr = 2+1+1;
        else if (ch < 100)
            incr = 2+2+1;
        else if (ch < 1000)
            incr = 2+3+1;
        else if (ch < 10000)
            incr = 2+4+1;
        else if (ch < 100000)
            incr = 2+5+1;
        else if (ch < 1000000)
            incr = 2+6+1;
        else {
            assert(ch <= MAX_UNICODE);
            incr = 2+7+1;
        }
        if (size > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "encoded result is too long for a Python string");
            return NULL;
        }
        size += incr;
    }

    str = _PyBytesWriter_Prepare(writer, str, size);
    if (str == NULL)
        return NULL;

    /* generate replacement */
    for (i = collstart; i < collend; ++i) {
        /* large enough for any 32-bit value, although at most 7 digits
           are expected (MAX_UNICODE is 1114111) */
        char digits[10];
        int ndigits = 0;

        ch = PyUnicode_READ(kind, data, i);
        /* collect the digits, least significant first */
        do {
            digits[ndigits++] = (char)('0' + (int)(ch % 10));
            ch /= 10;
        } while (ch != 0);

        *str++ = '&';
        *str++ = '#';
        while (ndigits > 0)
            *str++ = digits[--ndigits];
        *str++ = ';';
    }
    return str;
}

/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode characters as the bit index.
*/ 750 751 /* the linebreak mask is set up by Unicode_Init below */ 752 753 #if LONG_BIT >= 128 754 #define BLOOM_WIDTH 128 755 #elif LONG_BIT >= 64 756 #define BLOOM_WIDTH 64 757 #elif LONG_BIT >= 32 758 #define BLOOM_WIDTH 32 759 #else 760 #error "LONG_BIT is smaller than 32" 761 #endif 762 763 #define BLOOM_MASK unsigned long 764 765 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; 766 767 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 768 769 #define BLOOM_LINEBREAK(ch) \ 770 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 771 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 772 773 static inline BLOOM_MASK 774 make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 775 { 776 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \ 777 do { \ 778 TYPE *data = (TYPE *)PTR; \ 779 TYPE *end = data + LEN; \ 780 Py_UCS4 ch; \ 781 for (; data != end; data++) { \ 782 ch = *data; \ 783 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \ 784 } \ 785 break; \ 786 } while (0) 787 788 /* calculate simple bloom-style bitmask for a given unicode string */ 789 790 BLOOM_MASK mask; 791 792 mask = 0; 793 switch (kind) { 794 case PyUnicode_1BYTE_KIND: 795 BLOOM_UPDATE(Py_UCS1, mask, ptr, len); 796 break; 797 case PyUnicode_2BYTE_KIND: 798 BLOOM_UPDATE(Py_UCS2, mask, ptr, len); 799 break; 800 case PyUnicode_4BYTE_KIND: 801 BLOOM_UPDATE(Py_UCS4, mask, ptr, len); 802 break; 803 default: 804 Py_UNREACHABLE(); 805 } 806 return mask; 807 808 #undef BLOOM_UPDATE 809 } 810 811 static int 812 ensure_unicode(PyObject *obj) 813 { 814 if (!PyUnicode_Check(obj)) { 815 PyErr_Format(PyExc_TypeError, 816 "must be str, not %.100s", 817 Py_TYPE(obj)->tp_name); 818 return -1; 819 } 820 return PyUnicode_READY(obj); 821 } 822 823 /* Compilation of templated routines */ 824 825 #include "stringlib/asciilib.h" 826 #include "stringlib/fastsearch.h" 827 #include "stringlib/partition.h" 828 #include "stringlib/split.h" 829 #include "stringlib/count.h" 830 #include "stringlib/find.h" 831 
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"

#include "stringlib/ucs1lib.h"
#include "stringlib/fastsearch.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"

#include "stringlib/ucs2lib.h"
#include "stringlib/fastsearch.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"

#include "stringlib/ucs4lib.h"
#include "stringlib/fastsearch.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"

#include "stringlib/unicodedefs.h"
#include "stringlib/fastsearch.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/undef.h"

/* --- Unicode Object ----------------------------------------------------- */

/* Search the character `ch` among the `size` characters stored at `s`,
   each of them `kind` bytes wide.  A positive `direction` uses the
   *_find_char routine of the matching stringlib flavour, any other value
   the *_rfind_char one; their result is returned unchanged.  Returns -1
   right away when `ch` cannot be represented in `kind` bytes. */
static inline Py_ssize_t
findchar(const void *s, int kind,
         Py_ssize_t size, Py_UCS4 ch,
         int direction)
{
    const int forward = (direction > 0);

    switch (kind) {
    case PyUnicode_1BYTE_KIND: {
        const Py_UCS1 c1 = (Py_UCS1) ch;
        if (c1 != ch)
            return -1;
        return forward
            ? ucs1lib_find_char((Py_UCS1 *) s, size, c1)
            : ucs1lib_rfind_char((Py_UCS1 *) s, size, c1);
    }
    case PyUnicode_2BYTE_KIND: {
        const Py_UCS2 c2 = (Py_UCS2) ch;
        if (c2 != ch)
            return -1;
        return forward
            ? ucs2lib_find_char((Py_UCS2 *) s, size, c2)
            : ucs2lib_rfind_char((Py_UCS2 *) s, size, c2);
    }
    case PyUnicode_4BYTE_KIND:
        return forward
            ? ucs4lib_find_char((Py_UCS4 *) s, size, ch)
            : ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
    default:
        Py_UNREACHABLE();
    }
}

#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Only the characters from index old_length up to the current length are
   overwritten; nothing happens if the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);

    if (length > old_length) {
        /* kind is also the size of one character in bytes */
        memset(data + old_length * kind, 0xff, (length - old_length) * kind);
    }
}
#endif

/* Resize a compact string to `length` characters by reallocating the
   object itself, so the object may move: callers must continue with the
   returned pointer.  Returns NULL with MemoryError set on failure.

   A separately allocated UTF-8 buffer is dropped before the reallocation
   (also if the reallocation then fails); a separately allocated wstr
   buffer is dropped after it. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    char_size = PyUnicode_KIND(unicode);
    struct_size = PyUnicode_IS_ASCII(unicode)
        ? sizeof(PyASCIIObject)
        : sizeof(PyCompactUnicodeObject);
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* header + (length + 1) characters must fit in Py_ssize_t */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    new_size = (struct_size + (length + 1) * char_size);

    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_DEL(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
    /* the object is unregistered while it may move and registered again
       below, whether the reallocation succeeded or not */
    _Py_DEC_REFTOTAL;
    _Py_ForgetReference(unicode);

    new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
    if (new_unicode == NULL) {
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* wstr was an alias of the data: re-point it at the new block */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        PyObject_DEL(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif
    /* terminating null character */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}

/* Resize a non-compact string with a single reference to `length`
   characters; the object itself stays where it is, only its buffers are
   reallocated.  Returns 0 on success and -1 with MemoryError set on
   failure.

   For a ready string the data buffer is resized first.  The wstr buffer
   is resized afterwards if the string is not ready, or if it has a wstr
   buffer that is not shared with the data. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;

    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        data = _PyUnicode_DATA_ANY(unicode);
        char_size = PyUnicode_KIND(unicode);
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* on failure the object still holds the old, valid pointer */
        data = PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        /* aliases of the data buffer have to follow it */
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            /* no separate wstr buffer to take care of */
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}

/* Return a new string of `length` characters that starts with the
   characters of `unicode`, as many of them as fit.  `unicode` itself is
   not modified.  Returns NULL if the new string cannot be created. */
static PyObject*
resize_copy(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t copy_length;

    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) {
        /* wchar_t-only string: the copy is one as well */
        PyObject *w = (PyObject*)_PyUnicode_New(length);
        if (w == NULL)
            return NULL;

        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
        copy_length = Py_MIN(copy_length, length);
        memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
                  copy_length * sizeof(wchar_t));
        return w;
    }
    else {
        PyObject *copy;
        Py_ssize_t old_length;

        assert(PyUnicode_IS_READY(unicode));

        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
        if (copy == NULL)
            return NULL;

        old_length = PyUnicode_GET_LENGTH(unicode);
        copy_length = Py_MIN(length, old_length);
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
        return copy;
    }
}

/* Create a string object in the wchar_t-only ("not ready") state with a
   wstr buffer of `length` characters.

   One extra character is allocated and set to 0, so that the buffer is
   always null-terminated; some code (e.g. new_identifier) relies on that.

   A length of 0 returns a new reference to the shared empty string once
   it exists.  Returns NULL with an exception set on failure.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/

static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)
{
    PyUnicodeObject *unicode;
    size_t new_size;

    /* Optimization for empty strings */
    if (length == 0 && unicode_empty != NULL) {
        Py_INCREF(unicode_empty);
        return (PyUnicodeObject*)unicode_empty;
    }

    /* Ensure we won't overflow the size. */
    if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
        return (PyUnicodeObject *)PyErr_NoMemory();
    }
    if (length < 0) {
        PyErr_SetString(PyExc_SystemError,
                        "Negative size passed to _PyUnicode_New");
        return NULL;
    }

    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
    if (unicode == NULL)
        return NULL;
    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);

    /* every field is set before anything else can fail, so that the
       object can be released safely below */
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    _PyUnicode_HASH(unicode) = -1;
    _PyUnicode_STATE(unicode).interned = 0;
    _PyUnicode_STATE(unicode).kind = 0;
    _PyUnicode_STATE(unicode).compact = 0;
    _PyUnicode_STATE(unicode).ready = 0;
    _PyUnicode_STATE(unicode).ascii = 0;
    _PyUnicode_DATA_ANY(unicode) = NULL;
    _PyUnicode_LENGTH(unicode) = 0;
    _PyUnicode_UTF8(unicode) = NULL;
    _PyUnicode_UTF8_LENGTH(unicode) = 0;

    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
    if (!_PyUnicode_WSTR(unicode)) {
        Py_DECREF(unicode);
        PyErr_NoMemory();
        return NULL;
    }

    /* Initialize the first element to guard against cases where
     * the caller fails before initializing str -- unicode_resize()
     * reads str[0], and the Keep-Alive optimization can keep memory
     * allocated for str alive across a call to unicode_dealloc(unicode).
     * We don't want unicode_resize to read uninitialized memory in
     * that case.
     */
    _PyUnicode_WSTR(unicode)[0] = 0;
    _PyUnicode_WSTR(unicode)[length] = 0;

    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
    return unicode;
}

/* Return a static, human readable name for the internal representation
   of `unicode`.  Non-compact strings are reported as "wstr" when they
   are not ready and with a "legacy" prefix otherwise. */
static const char*
unicode_kind_name(PyObject *unicode)
{
    /* don't check consistency: unicode_kind_name() is called from
       _PyUnicode_Dump() */
    const int compact = PyUnicode_IS_COMPACT(unicode);

    if (!compact && !PyUnicode_IS_READY(unicode))
        return "wstr";

    /* from here on the string is ready */
    assert(PyUnicode_IS_READY(unicode));
    switch (PyUnicode_KIND(unicode)) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(unicode))
            return compact ? "ascii" : "legacy ascii";
        return compact ? "latin1" : "legacy latin1";
    case PyUnicode_2BYTE_KIND:
        return compact ? "UCS2" : "legacy UCS2";
    case PyUnicode_4BYTE_KIND:
        return compact ? "UCS4" : "legacy UCS4";
    default:
        return compact ? "<invalid compact kind>" : "<legacy invalid kind>";
    }
}

#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n",
_PyUnicode_COMPACT_DATA(unicode)); 1223 return PyUnicode_DATA(unicode); 1224 } 1225 1226 void 1227 _PyUnicode_Dump(PyObject *op) 1228 { 1229 PyASCIIObject *ascii = (PyASCIIObject *)op; 1230 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 1231 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 1232 void *data; 1233 1234 if (ascii->state.compact) 1235 { 1236 if (ascii->state.ascii) 1237 data = (ascii + 1); 1238 else 1239 data = (compact + 1); 1240 } 1241 else 1242 data = unicode->data.any; 1243 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ", 1244 unicode_kind_name(op), ascii->length); 1245 1246 if (ascii->wstr == data) 1247 printf("shared "); 1248 printf("wstr=%p", ascii->wstr); 1249 1250 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 1251 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length); 1252 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 1253 printf("shared "); 1254 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)", 1255 compact->utf8, compact->utf8_length); 1256 } 1257 printf(", data=%p\n", data); 1258 } 1259 #endif 1260 1261 PyObject * 1262 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 1263 { 1264 PyObject *obj; 1265 PyCompactUnicodeObject *unicode; 1266 void *data; 1267 enum PyUnicode_Kind kind; 1268 int is_sharing, is_ascii; 1269 Py_ssize_t char_size; 1270 Py_ssize_t struct_size; 1271 1272 /* Optimization for empty strings */ 1273 if (size == 0 && unicode_empty != NULL) { 1274 Py_INCREF(unicode_empty); 1275 return unicode_empty; 1276 } 1277 1278 is_ascii = 0; 1279 is_sharing = 0; 1280 struct_size = sizeof(PyCompactUnicodeObject); 1281 if (maxchar < 128) { 1282 kind = PyUnicode_1BYTE_KIND; 1283 char_size = 1; 1284 is_ascii = 1; 1285 struct_size = sizeof(PyASCIIObject); 1286 } 1287 else if (maxchar < 256) { 1288 kind = PyUnicode_1BYTE_KIND; 1289 char_size = 1; 1290 } 1291 else if (maxchar < 65536) { 1292 kind = PyUnicode_2BYTE_KIND; 1293 char_size = 2; 1294 if (sizeof(wchar_t) == 2) 1295 is_sharing = 1; 1296 } 1297 
else { 1298 if (maxchar > MAX_UNICODE) { 1299 PyErr_SetString(PyExc_SystemError, 1300 "invalid maximum character passed to PyUnicode_New"); 1301 return NULL; 1302 } 1303 kind = PyUnicode_4BYTE_KIND; 1304 char_size = 4; 1305 if (sizeof(wchar_t) == 4) 1306 is_sharing = 1; 1307 } 1308 1309 /* Ensure we won't overflow the size. */ 1310 if (size < 0) { 1311 PyErr_SetString(PyExc_SystemError, 1312 "Negative size passed to PyUnicode_New"); 1313 return NULL; 1314 } 1315 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 1316 return PyErr_NoMemory(); 1317 1318 /* Duplicated allocation code from _PyObject_New() instead of a call to 1319 * PyObject_New() so we are able to allocate space for the object and 1320 * it's data buffer. 1321 */ 1322 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 1323 if (obj == NULL) 1324 return PyErr_NoMemory(); 1325 obj = PyObject_INIT(obj, &PyUnicode_Type); 1326 if (obj == NULL) 1327 return NULL; 1328 1329 unicode = (PyCompactUnicodeObject *)obj; 1330 if (is_ascii) 1331 data = ((PyASCIIObject*)obj) + 1; 1332 else 1333 data = unicode + 1; 1334 _PyUnicode_LENGTH(unicode) = size; 1335 _PyUnicode_HASH(unicode) = -1; 1336 _PyUnicode_STATE(unicode).interned = 0; 1337 _PyUnicode_STATE(unicode).kind = kind; 1338 _PyUnicode_STATE(unicode).compact = 1; 1339 _PyUnicode_STATE(unicode).ready = 1; 1340 _PyUnicode_STATE(unicode).ascii = is_ascii; 1341 if (is_ascii) { 1342 ((char*)data)[size] = 0; 1343 _PyUnicode_WSTR(unicode) = NULL; 1344 } 1345 else if (kind == PyUnicode_1BYTE_KIND) { 1346 ((char*)data)[size] = 0; 1347 _PyUnicode_WSTR(unicode) = NULL; 1348 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1349 unicode->utf8 = NULL; 1350 unicode->utf8_length = 0; 1351 } 1352 else { 1353 unicode->utf8 = NULL; 1354 unicode->utf8_length = 0; 1355 if (kind == PyUnicode_2BYTE_KIND) 1356 ((Py_UCS2*)data)[size] = 0; 1357 else /* kind == PyUnicode_4BYTE_KIND */ 1358 ((Py_UCS4*)data)[size] = 0; 1359 if (is_sharing) { 1360 
_PyUnicode_WSTR_LENGTH(unicode) = size; 1361 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 1362 } 1363 else { 1364 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1365 _PyUnicode_WSTR(unicode) = NULL; 1366 } 1367 } 1368 #ifdef Py_DEBUG 1369 unicode_fill_invalid((PyObject*)unicode, 0); 1370 #endif 1371 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 1372 return obj; 1373 } 1374 1375 #if SIZEOF_WCHAR_T == 2 1376 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this 1377 will decode surrogate pairs, the other conversions are implemented as macros 1378 for efficiency. 1379 1380 This function assumes that unicode can hold one more code point than wstr 1381 characters for a terminating null character. */ 1382 static void 1383 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 1384 PyObject *unicode) 1385 { 1386 const wchar_t *iter; 1387 Py_UCS4 *ucs4_out; 1388 1389 assert(unicode != NULL); 1390 assert(_PyUnicode_CHECK(unicode)); 1391 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1392 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 1393 1394 for (iter = begin; iter < end; ) { 1395 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 1396 _PyUnicode_GET_LENGTH(unicode))); 1397 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1398 && (iter+1) < end 1399 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1400 { 1401 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1402 iter += 2; 1403 } 1404 else { 1405 *ucs4_out++ = *iter; 1406 iter++; 1407 } 1408 } 1409 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 1410 _PyUnicode_GET_LENGTH(unicode))); 1411 1412 } 1413 #endif 1414 1415 static int 1416 unicode_check_modifiable(PyObject *unicode) 1417 { 1418 if (!unicode_modifiable(unicode)) { 1419 PyErr_SetString(PyExc_SystemError, 1420 "Cannot modify a string currently used"); 1421 return -1; 1422 } 1423 return 0; 1424 } 1425 1426 static int 1427 _copy_characters(PyObject *to, Py_ssize_t to_start, 1428 PyObject *from, Py_ssize_t from_start, 
1429 Py_ssize_t how_many, int check_maxchar) 1430 { 1431 unsigned int from_kind, to_kind; 1432 void *from_data, *to_data; 1433 1434 assert(0 <= how_many); 1435 assert(0 <= from_start); 1436 assert(0 <= to_start); 1437 assert(PyUnicode_Check(from)); 1438 assert(PyUnicode_IS_READY(from)); 1439 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); 1440 1441 assert(PyUnicode_Check(to)); 1442 assert(PyUnicode_IS_READY(to)); 1443 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1444 1445 if (how_many == 0) 1446 return 0; 1447 1448 from_kind = PyUnicode_KIND(from); 1449 from_data = PyUnicode_DATA(from); 1450 to_kind = PyUnicode_KIND(to); 1451 to_data = PyUnicode_DATA(to); 1452 1453 #ifdef Py_DEBUG 1454 if (!check_maxchar 1455 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)) 1456 { 1457 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1458 Py_UCS4 ch; 1459 Py_ssize_t i; 1460 for (i=0; i < how_many; i++) { 1461 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1462 assert(ch <= to_maxchar); 1463 } 1464 } 1465 #endif 1466 1467 if (from_kind == to_kind) { 1468 if (check_maxchar 1469 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) 1470 { 1471 /* Writing Latin-1 characters into an ASCII string requires to 1472 check that all written characters are pure ASCII */ 1473 Py_UCS4 max_char; 1474 max_char = ucs1lib_find_max_char(from_data, 1475 (Py_UCS1*)from_data + how_many); 1476 if (max_char >= 128) 1477 return -1; 1478 } 1479 memcpy((char*)to_data + to_kind * to_start, 1480 (char*)from_data + from_kind * from_start, 1481 to_kind * how_many); 1482 } 1483 else if (from_kind == PyUnicode_1BYTE_KIND 1484 && to_kind == PyUnicode_2BYTE_KIND) 1485 { 1486 _PyUnicode_CONVERT_BYTES( 1487 Py_UCS1, Py_UCS2, 1488 PyUnicode_1BYTE_DATA(from) + from_start, 1489 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1490 PyUnicode_2BYTE_DATA(to) + to_start 1491 ); 1492 } 1493 else if (from_kind == PyUnicode_1BYTE_KIND 1494 && to_kind == 
PyUnicode_4BYTE_KIND) 1495 { 1496 _PyUnicode_CONVERT_BYTES( 1497 Py_UCS1, Py_UCS4, 1498 PyUnicode_1BYTE_DATA(from) + from_start, 1499 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1500 PyUnicode_4BYTE_DATA(to) + to_start 1501 ); 1502 } 1503 else if (from_kind == PyUnicode_2BYTE_KIND 1504 && to_kind == PyUnicode_4BYTE_KIND) 1505 { 1506 _PyUnicode_CONVERT_BYTES( 1507 Py_UCS2, Py_UCS4, 1508 PyUnicode_2BYTE_DATA(from) + from_start, 1509 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1510 PyUnicode_4BYTE_DATA(to) + to_start 1511 ); 1512 } 1513 else { 1514 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)); 1515 1516 if (!check_maxchar) { 1517 if (from_kind == PyUnicode_2BYTE_KIND 1518 && to_kind == PyUnicode_1BYTE_KIND) 1519 { 1520 _PyUnicode_CONVERT_BYTES( 1521 Py_UCS2, Py_UCS1, 1522 PyUnicode_2BYTE_DATA(from) + from_start, 1523 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1524 PyUnicode_1BYTE_DATA(to) + to_start 1525 ); 1526 } 1527 else if (from_kind == PyUnicode_4BYTE_KIND 1528 && to_kind == PyUnicode_1BYTE_KIND) 1529 { 1530 _PyUnicode_CONVERT_BYTES( 1531 Py_UCS4, Py_UCS1, 1532 PyUnicode_4BYTE_DATA(from) + from_start, 1533 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1534 PyUnicode_1BYTE_DATA(to) + to_start 1535 ); 1536 } 1537 else if (from_kind == PyUnicode_4BYTE_KIND 1538 && to_kind == PyUnicode_2BYTE_KIND) 1539 { 1540 _PyUnicode_CONVERT_BYTES( 1541 Py_UCS4, Py_UCS2, 1542 PyUnicode_4BYTE_DATA(from) + from_start, 1543 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1544 PyUnicode_2BYTE_DATA(to) + to_start 1545 ); 1546 } 1547 else { 1548 Py_UNREACHABLE(); 1549 } 1550 } 1551 else { 1552 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1553 Py_UCS4 ch; 1554 Py_ssize_t i; 1555 1556 for (i=0; i < how_many; i++) { 1557 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1558 if (ch > to_maxchar) 1559 return -1; 1560 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1561 } 1562 } 1563 } 1564 return 0; 1565 
} 1566 1567 void 1568 _PyUnicode_FastCopyCharacters( 1569 PyObject *to, Py_ssize_t to_start, 1570 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many) 1571 { 1572 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1573 } 1574 1575 Py_ssize_t 1576 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1577 PyObject *from, Py_ssize_t from_start, 1578 Py_ssize_t how_many) 1579 { 1580 int err; 1581 1582 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1583 PyErr_BadInternalCall(); 1584 return -1; 1585 } 1586 1587 if (PyUnicode_READY(from) == -1) 1588 return -1; 1589 if (PyUnicode_READY(to) == -1) 1590 return -1; 1591 1592 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) { 1593 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1594 return -1; 1595 } 1596 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) { 1597 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1598 return -1; 1599 } 1600 if (how_many < 0) { 1601 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative"); 1602 return -1; 1603 } 1604 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many); 1605 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1606 PyErr_Format(PyExc_SystemError, 1607 "Cannot write %zi characters at %zi " 1608 "in a string of %zi characters", 1609 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1610 return -1; 1611 } 1612 1613 if (how_many == 0) 1614 return 0; 1615 1616 if (unicode_check_modifiable(to)) 1617 return -1; 1618 1619 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1620 if (err) { 1621 PyErr_Format(PyExc_SystemError, 1622 "Cannot copy %s characters " 1623 "into a string of %s characters", 1624 unicode_kind_name(from), 1625 unicode_kind_name(to)); 1626 return -1; 1627 } 1628 return how_many; 1629 } 1630 1631 /* Find the maximum code point and count the number of surrogate pairs so a 1632 correct string length can be computed before converting a string to 
UCS4. 1633 This function counts single surrogates as a character and not as a pair. 1634 1635 Return 0 on success, or -1 on error. */ 1636 static int 1637 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1638 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1639 { 1640 const wchar_t *iter; 1641 Py_UCS4 ch; 1642 1643 assert(num_surrogates != NULL && maxchar != NULL); 1644 *num_surrogates = 0; 1645 *maxchar = 0; 1646 1647 for (iter = begin; iter < end; ) { 1648 #if SIZEOF_WCHAR_T == 2 1649 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1650 && (iter+1) < end 1651 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1652 { 1653 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1654 ++(*num_surrogates); 1655 iter += 2; 1656 } 1657 else 1658 #endif 1659 { 1660 ch = *iter; 1661 iter++; 1662 } 1663 if (ch > *maxchar) { 1664 *maxchar = ch; 1665 if (*maxchar > MAX_UNICODE) { 1666 PyErr_Format(PyExc_ValueError, 1667 "character U+%x is not in range [U+0000; U+10ffff]", 1668 ch); 1669 return -1; 1670 } 1671 } 1672 } 1673 return 0; 1674 } 1675 1676 int 1677 _PyUnicode_Ready(PyObject *unicode) 1678 { 1679 wchar_t *end; 1680 Py_UCS4 maxchar = 0; 1681 Py_ssize_t num_surrogates; 1682 #if SIZEOF_WCHAR_T == 2 1683 Py_ssize_t length_wo_surrogates; 1684 #endif 1685 1686 /* _PyUnicode_Ready() is only intended for old-style API usage where 1687 strings were created using _PyObject_New() and where no canonical 1688 representation (the str field) has been set yet aka strings 1689 which are not yet ready. 
*/ 1690 assert(_PyUnicode_CHECK(unicode)); 1691 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1692 assert(_PyUnicode_WSTR(unicode) != NULL); 1693 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1694 assert(_PyUnicode_UTF8(unicode) == NULL); 1695 /* Actually, it should neither be interned nor be anything else: */ 1696 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1697 1698 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1699 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1700 &maxchar, &num_surrogates) == -1) 1701 return -1; 1702 1703 if (maxchar < 256) { 1704 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1705 if (!_PyUnicode_DATA_ANY(unicode)) { 1706 PyErr_NoMemory(); 1707 return -1; 1708 } 1709 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1710 _PyUnicode_WSTR(unicode), end, 1711 PyUnicode_1BYTE_DATA(unicode)); 1712 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1713 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1714 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1715 if (maxchar < 128) { 1716 _PyUnicode_STATE(unicode).ascii = 1; 1717 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1718 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1719 } 1720 else { 1721 _PyUnicode_STATE(unicode).ascii = 0; 1722 _PyUnicode_UTF8(unicode) = NULL; 1723 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1724 } 1725 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1726 _PyUnicode_WSTR(unicode) = NULL; 1727 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1728 } 1729 /* In this case we might have to convert down from 4-byte native 1730 wchar_t to 2-byte unicode. */ 1731 else if (maxchar < 65536) { 1732 assert(num_surrogates == 0 && 1733 "FindMaxCharAndNumSurrogatePairs() messed up"); 1734 1735 #if SIZEOF_WCHAR_T == 2 1736 /* We can share representations and are done. 
*/ 1737 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1738 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1739 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1740 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1741 _PyUnicode_UTF8(unicode) = NULL; 1742 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1743 #else 1744 /* sizeof(wchar_t) == 4 */ 1745 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1746 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1747 if (!_PyUnicode_DATA_ANY(unicode)) { 1748 PyErr_NoMemory(); 1749 return -1; 1750 } 1751 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1752 _PyUnicode_WSTR(unicode), end, 1753 PyUnicode_2BYTE_DATA(unicode)); 1754 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1755 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1756 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1757 _PyUnicode_UTF8(unicode) = NULL; 1758 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1759 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1760 _PyUnicode_WSTR(unicode) = NULL; 1761 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1762 #endif 1763 } 1764 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1765 else { 1766 #if SIZEOF_WCHAR_T == 2 1767 /* in case the native representation is 2-bytes, we need to allocate a 1768 new normalized 4-byte version. 
*/ 1769 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1770 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) { 1771 PyErr_NoMemory(); 1772 return -1; 1773 } 1774 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1775 if (!_PyUnicode_DATA_ANY(unicode)) { 1776 PyErr_NoMemory(); 1777 return -1; 1778 } 1779 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1780 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1781 _PyUnicode_UTF8(unicode) = NULL; 1782 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1783 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1784 _PyUnicode_STATE(unicode).ready = 1; 1785 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1786 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1787 _PyUnicode_WSTR(unicode) = NULL; 1788 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1789 #else 1790 assert(num_surrogates == 0); 1791 1792 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1793 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1794 _PyUnicode_UTF8(unicode) = NULL; 1795 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1796 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1797 #endif 1798 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1799 } 1800 _PyUnicode_STATE(unicode).ready = 1; 1801 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1802 return 0; 1803 } 1804 1805 static void 1806 unicode_dealloc(PyObject *unicode) 1807 { 1808 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1809 case SSTATE_NOT_INTERNED: 1810 break; 1811 1812 case SSTATE_INTERNED_MORTAL: 1813 /* revive dead object temporarily for DelItem */ 1814 Py_REFCNT(unicode) = 3; 1815 if (PyDict_DelItem(interned, unicode) != 0) 1816 Py_FatalError( 1817 "deletion of interned string failed"); 1818 break; 1819 1820 case SSTATE_INTERNED_IMMORTAL: 1821 Py_FatalError("Immortal interned string died."); 1822 /* fall through */ 1823 1824 default: 1825 Py_FatalError("Inconsistent interned string state."); 
1826 } 1827 1828 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1829 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1830 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1831 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1832 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) 1833 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1834 1835 Py_TYPE(unicode)->tp_free(unicode); 1836 } 1837 1838 #ifdef Py_DEBUG 1839 static int 1840 unicode_is_singleton(PyObject *unicode) 1841 { 1842 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1843 if (unicode == unicode_empty) 1844 return 1; 1845 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1846 { 1847 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1848 if (ch < 256 && unicode_latin1[ch] == unicode) 1849 return 1; 1850 } 1851 return 0; 1852 } 1853 #endif 1854 1855 static int 1856 unicode_modifiable(PyObject *unicode) 1857 { 1858 assert(_PyUnicode_CHECK(unicode)); 1859 if (Py_REFCNT(unicode) != 1) 1860 return 0; 1861 if (_PyUnicode_HASH(unicode) != -1) 1862 return 0; 1863 if (PyUnicode_CHECK_INTERNED(unicode)) 1864 return 0; 1865 if (!PyUnicode_CheckExact(unicode)) 1866 return 0; 1867 #ifdef Py_DEBUG 1868 /* singleton refcount is greater than 1 */ 1869 assert(!unicode_is_singleton(unicode)); 1870 #endif 1871 return 1; 1872 } 1873 1874 static int 1875 unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1876 { 1877 PyObject *unicode; 1878 Py_ssize_t old_length; 1879 1880 assert(p_unicode != NULL); 1881 unicode = *p_unicode; 1882 1883 assert(unicode != NULL); 1884 assert(PyUnicode_Check(unicode)); 1885 assert(0 <= length); 1886 1887 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1888 old_length = PyUnicode_WSTR_LENGTH(unicode); 1889 else 1890 old_length = PyUnicode_GET_LENGTH(unicode); 1891 if (old_length == length) 1892 return 0; 1893 1894 if (length == 0) { 1895 _Py_INCREF_UNICODE_EMPTY(); 1896 if (!unicode_empty) 1897 return -1; 1898 Py_SETREF(*p_unicode, unicode_empty); 1899 return 0; 1900 } 1901 1902 if 
(!unicode_modifiable(unicode)) { 1903 PyObject *copy = resize_copy(unicode, length); 1904 if (copy == NULL) 1905 return -1; 1906 Py_SETREF(*p_unicode, copy); 1907 return 0; 1908 } 1909 1910 if (PyUnicode_IS_COMPACT(unicode)) { 1911 PyObject *new_unicode = resize_compact(unicode, length); 1912 if (new_unicode == NULL) 1913 return -1; 1914 *p_unicode = new_unicode; 1915 return 0; 1916 } 1917 return resize_inplace(unicode, length); 1918 } 1919 1920 int 1921 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1922 { 1923 PyObject *unicode; 1924 if (p_unicode == NULL) { 1925 PyErr_BadInternalCall(); 1926 return -1; 1927 } 1928 unicode = *p_unicode; 1929 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1930 { 1931 PyErr_BadInternalCall(); 1932 return -1; 1933 } 1934 return unicode_resize(p_unicode, length); 1935 } 1936 1937 /* Copy an ASCII or latin1 char* string into a Python Unicode string. 1938 1939 WARNING: The function doesn't copy the terminating null character and 1940 doesn't check the maximum character (may write a latin1 character in an 1941 ASCII string). 
*/ 1942 static void 1943 unicode_write_cstr(PyObject *unicode, Py_ssize_t index, 1944 const char *str, Py_ssize_t len) 1945 { 1946 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 1947 void *data = PyUnicode_DATA(unicode); 1948 const char *end = str + len; 1949 1950 switch (kind) { 1951 case PyUnicode_1BYTE_KIND: { 1952 assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 1953 #ifdef Py_DEBUG 1954 if (PyUnicode_IS_ASCII(unicode)) { 1955 Py_UCS4 maxchar = ucs1lib_find_max_char( 1956 (const Py_UCS1*)str, 1957 (const Py_UCS1*)str + len); 1958 assert(maxchar < 128); 1959 } 1960 #endif 1961 memcpy((char *) data + index, str, len); 1962 break; 1963 } 1964 case PyUnicode_2BYTE_KIND: { 1965 Py_UCS2 *start = (Py_UCS2 *)data + index; 1966 Py_UCS2 *ucs2 = start; 1967 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1968 1969 for (; str < end; ++ucs2, ++str) 1970 *ucs2 = (Py_UCS2)*str; 1971 1972 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 1973 break; 1974 } 1975 default: { 1976 Py_UCS4 *start = (Py_UCS4 *)data + index; 1977 Py_UCS4 *ucs4 = start; 1978 assert(kind == PyUnicode_4BYTE_KIND); 1979 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1980 1981 for (; str < end; ++ucs4, ++str) 1982 *ucs4 = (Py_UCS4)*str; 1983 1984 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 1985 } 1986 } 1987 } 1988 1989 static PyObject* 1990 get_latin1_char(unsigned char ch) 1991 { 1992 PyObject *unicode = unicode_latin1[ch]; 1993 if (!unicode) { 1994 unicode = PyUnicode_New(1, ch); 1995 if (!unicode) 1996 return NULL; 1997 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1998 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1999 unicode_latin1[ch] = unicode; 2000 } 2001 Py_INCREF(unicode); 2002 return unicode; 2003 } 2004 2005 static PyObject* 2006 unicode_char(Py_UCS4 ch) 2007 { 2008 PyObject *unicode; 2009 2010 assert(ch <= MAX_UNICODE); 2011 2012 if (ch < 256) 2013 return get_latin1_char(ch); 2014 2015 unicode = PyUnicode_New(1, ch); 2016 if (unicode == NULL) 2017 return NULL; 
2018 2019 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND); 2020 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 2021 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch; 2022 } else { 2023 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 2024 PyUnicode_4BYTE_DATA(unicode)[0] = ch; 2025 } 2026 assert(_PyUnicode_CheckConsistency(unicode, 1)); 2027 return unicode; 2028 } 2029 2030 PyObject * 2031 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 2032 { 2033 if (u == NULL) 2034 return (PyObject*)_PyUnicode_New(size); 2035 2036 if (size < 0) { 2037 PyErr_BadInternalCall(); 2038 return NULL; 2039 } 2040 2041 return PyUnicode_FromWideChar(u, size); 2042 } 2043 2044 PyObject * 2045 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) 2046 { 2047 PyObject *unicode; 2048 Py_UCS4 maxchar = 0; 2049 Py_ssize_t num_surrogates; 2050 2051 if (u == NULL && size != 0) { 2052 PyErr_BadInternalCall(); 2053 return NULL; 2054 } 2055 2056 if (size == -1) { 2057 size = wcslen(u); 2058 } 2059 2060 /* If the Unicode data is known at construction time, we can apply 2061 some optimizations which share commonly used objects. 
*/

    /* Optimization for empty strings */
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    /* Single character Unicode objects in the Latin-1 range are
       shared when using this constructor */
    if (size == 1 && (Py_UCS4)*u < 256)
        return get_latin1_char((unsigned char)*u);

    /* If not empty and not single character, copy the Unicode data
       into the new object */
    if (find_maxchar_surrogates(u, u + size,
                                &maxchar, &num_surrogates) == -1)
        return NULL;

    unicode = PyUnicode_New(size - num_surrogates, maxchar);
    if (!unicode)
        return NULL;

    switch (PyUnicode_KIND(unicode)) {
    case PyUnicode_1BYTE_KIND:
        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
                                 u, u + size, PyUnicode_1BYTE_DATA(unicode));
        break;
    case PyUnicode_2BYTE_KIND:
#if Py_UNICODE_SIZE == 2
        memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
#else
        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
                                 u, u + size, PyUnicode_2BYTE_DATA(unicode));
#endif
        break;
    case PyUnicode_4BYTE_KIND:
#if SIZEOF_WCHAR_T == 2
        /* This is the only case which has to process surrogates, thus
           a simple copy loop is not enough and we need a function. */
        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
#else
        assert(num_surrogates == 0);
        memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
#endif
        break;
    default:
        Py_UNREACHABLE();
    }

    return unicode_result(unicode);
}

/* Build a Unicode object from a char buffer and an explicit length.

   - size < 0: set SystemError and return NULL.
   - u != NULL: return the result of PyUnicode_DecodeUTF8Stateful(u, size)
     with no error handler name and no "consumed" output.
   - u == NULL: return the object produced by _PyUnicode_New(size). */
PyObject *
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
{
    if (size < 0) {
        PyErr_SetString(PyExc_SystemError,
                        "Negative size passed to PyUnicode_FromStringAndSize");
        return NULL;
    }
    if (u != NULL)
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
    else
        return (PyObject *)_PyUnicode_New(size);
}

/* Build a Unicode object from a NUL-terminated char string.

   The length is taken with strlen(); if it does not fit in Py_ssize_t,
   OverflowError is set and NULL is returned.  Otherwise the bytes are
   handed to PyUnicode_DecodeUTF8Stateful(). */
PyObject *
PyUnicode_FromString(const char *u)
{
    size_t size = strlen(u);
    if (size > PY_SSIZE_T_MAX) {
        PyErr_SetString(PyExc_OverflowError, "input too long");
        return NULL;
    }
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
}

/* Return the string object cached in `id`, creating it on first use.

   On first use id->string is decoded, interned in place, and `id` is
   pushed onto the static_strings list so that
   _PyUnicode_ClearStaticStrings() can release it later.  Returns NULL if
   decoding fails.  No Py_INCREF is done on the returned pointer here. */
PyObject *
_PyUnicode_FromId(_Py_Identifier *id)
{
    if (!id->object) {
        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
                                                  strlen(id->string),
                                                  NULL, NULL);
        if (!id->object)
            return NULL;
        PyUnicode_InternInPlace(&id->object);
        /* An identifier without a cached object must not be linked yet. */
        assert(!id->next);
        id->next = static_strings;
        static_strings = id;
    }
    return id->object;
}

/* Walk the static_strings list: clear each identifier's cached object,
   unlink it (next = NULL), and finally reset the list head to NULL. */
void
_PyUnicode_ClearStaticStrings()
{
    _Py_Identifier *tmp, *s = static_strings;
    while (s) {
        Py_CLEAR(s->object);
        /* Save the successor before unlinking this node. */
        tmp = s->next;
        s->next = NULL;
        s = tmp;
    }
    static_strings = NULL;
}

/* Internal function, doesn't check maximum character */

/* Build a string from `size` bytes of `buffer`, allocating it with
   maxchar 127.  A single byte is served by get_latin1_char(); only
   debug builds assert that it is < 128.  Returns NULL if allocation
   fails. */
PyObject*
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
{
    const unsigned char *s = (const unsigned char *)buffer;
    PyObject *unicode;
    if (size == 1) {
#ifdef Py_DEBUG
        assert((unsigned char)s[0] < 128);
#endif
        return get_latin1_char(s[0]);
    }
    unicode = PyUnicode_New(size, 127);
    if (!unicode)
        return NULL;
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
}

/* Map a storage kind to a character threshold: 0x80 for 1-byte,
   0x100 for 2-byte, 0x10000 for 4-byte.
   NOTE(review): presumably the smallest maxchar for which this kind is
   the tightest fit -- confirm against the callers. */
static Py_UCS4
kind_maxchar_limit(unsigned int kind)
{
    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        return 0x80;
    case PyUnicode_2BYTE_KIND:
        return 0x100;
    case PyUnicode_4BYTE_KIND:
        return 0x10000;
    default:
        Py_UNREACHABLE();
    }
}

/* Build a string from `size` UCS1 code units.

   size == 0 returns the shared empty string, size == 1 goes through
   get_latin1_char(); otherwise the maximum character is scanned, a new
   object is allocated for it and the data is copied.  Returns NULL if
   allocation fails. */
static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
{
    PyObject *res;
    unsigned char max_char;

    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();
    assert(size > 0);
    if (size == 1)
        return get_latin1_char(u[0]);

    max_char = ucs1lib_find_max_char(u, u + size);
    res = PyUnicode_New(size, max_char);
    if (!res)
        return NULL;
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;
}

/* Build a string from `size` UCS2 code units.

   size == 0 returns the shared empty string, size == 1 goes through
   unicode_char().  Otherwise the result is allocated for the scanned
   maximum character: the data is copied as-is when max_char >= 256 and
   narrowed to one byte per character when it is below 256. */
static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
{
    PyObject *res;
    Py_UCS2 max_char;

    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();
    assert(size > 0);
    if (size == 1)
        return unicode_char(u[0]);

    max_char = ucs2lib_find_max_char(u, u + size);
    res = PyUnicode_New(size, max_char);
    if (!res)
        return NULL;
    if (max_char >= 256)
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
    else {
        _PyUnicode_CONVERT_BYTES(
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
    }
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;
}

/* Build a string from `size` UCS4 code points.

   size == 0 returns the shared empty string, size == 1 goes through
   unicode_char().  Otherwise the result is allocated for the scanned
   maximum character and the data is narrowed to 1 byte (max_char < 256),
   narrowed to 2 bytes (max_char < 0x10000) or copied unchanged. */
static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
{
    PyObject *res;
    Py_UCS4 max_char;

    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();
    assert(size > 0);
    if (size == 1)
        return unicode_char(u[0]);

    max_char = ucs4lib_find_max_char(u, u + size);
    res = PyUnicode_New(size, max_char);
    if (!res)
        return NULL;
    if (max_char < 256)
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
                                 PyUnicode_1BYTE_DATA(res));
    else if (max_char < 0x10000)
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
                                 PyUnicode_2BYTE_DATA(res));
    else
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;
}

/* Build a string from `size` code units of the given `kind` stored in
   `buffer`, dispatching to the matching _PyUnicode_FromUCS{1,2,4} helper.

   Sets ValueError and returns NULL if size < 0 (note that size == 0 is
   accepted despite the wording of the message); sets SystemError and
   returns NULL for an unknown kind. */
PyObject*
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
{
    if (size < 0) {
        PyErr_SetString(PyExc_ValueError, "size must be positive");
        return NULL;
    }
    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        return _PyUnicode_FromUCS1(buffer, size);
    case PyUnicode_2BYTE_KIND:
        return _PyUnicode_FromUCS2(buffer, size);
    case PyUnicode_4BYTE_KIND:
        return _PyUnicode_FromUCS4(buffer, size);
    default:
        PyErr_SetString(PyExc_SystemError, "invalid kind");
        return NULL;
    }
}

/* Return a maximum-character value for the slice [start, end) of a ready
   string.

   Shortcuts: the whole string returns PyUnicode_MAX_CHAR_VALUE(); an
   empty slice or an ASCII string returns 127.  Otherwise the slice is
   scanned with the find_max_char routine matching the string's kind. */
Py_UCS4
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
{
    enum PyUnicode_Kind kind;
    void *startptr, *endptr;

    assert(PyUnicode_IS_READY(unicode));
    assert(0 <= start);
    assert(end <= PyUnicode_GET_LENGTH(unicode));
    assert(start <= end);

    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
        return PyUnicode_MAX_CHAR_VALUE(unicode);

    if (start == end)
        return 127;

    if (PyUnicode_IS_ASCII(unicode))
        return 127;

    kind = PyUnicode_KIND(unicode);
    startptr = PyUnicode_DATA(unicode);
    /* The kind value is used as the byte width of one character.
       endptr is derived first, from the still-unshifted startptr. */
    endptr = (char *)startptr + end * kind;
    startptr = (char *)startptr + start * kind;
    switch(kind) {
    case PyUnicode_1BYTE_KIND:
        return ucs1lib_find_max_char(startptr, endptr);
    case PyUnicode_2BYTE_KIND:
        return ucs2lib_find_max_char(startptr, endptr);
    case PyUnicode_4BYTE_KIND:
        return ucs4lib_find_max_char(startptr, endptr);
    default:
        Py_UNREACHABLE();
    }
}

/* Ensure that a string uses the most efficient storage; if it does not,
   create a new string of the right kind.  The original string is then
   released and *p_unicode is replaced by the copy.  Write NULL into
   *p_unicode on error. */
static void
unicode_adjust_maxchar(PyObject **p_unicode)
{
    PyObject *unicode, *copy;
    Py_UCS4 max_char;
    Py_ssize_t len;
    unsigned int kind;

    assert(p_unicode != NULL);
    unicode = *p_unicode;
    assert(PyUnicode_IS_READY(unicode));
    if (PyUnicode_IS_ASCII(unicode))
        return;

    len = PyUnicode_GET_LENGTH(unicode);
    kind = PyUnicode_KIND(unicode);
    /* In each branch: return early when the actual maximum character
       still needs the current storage. */
    if (kind == PyUnicode_1BYTE_KIND) {
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
        max_char = ucs1lib_find_max_char(u, u + len);
        if (max_char >= 128)
            return;
    }
    else if (kind == PyUnicode_2BYTE_KIND) {
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
        max_char = ucs2lib_find_max_char(u, u + len);
        if (max_char >= 256)
            return;
    }
    else {
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
        assert(kind == PyUnicode_4BYTE_KIND);
        max_char = ucs4lib_find_max_char(u, u + len);
        if (max_char >= 0x10000)
            return;
    }
    copy = PyUnicode_New(len, max_char);
    if (copy != NULL)
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
    /* The original is dropped even when the allocation failed; in that
       case *p_unicode becomes NULL. */
    Py_DECREF(unicode);
    *p_unicode = copy;
}

/* Return a new string with the same length, maximum character and data
   as `unicode`.

   Returns NULL after PyErr_BadInternalCall() if `unicode` is not a str
   (or subclass) instance, or if readying it or allocating the copy
   fails. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    Py_ssize_t length;
    PyObject *copy;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_READY(unicode) == -1)
        return NULL;

    length = PyUnicode_GET_LENGTH(unicode);
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
    if (!copy)
        return NULL;
    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));

    memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
              length * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(copy, 1));
    return copy;
}


/* Widen Unicode objects to larger buffers. Don't write terminating null
   character. Return NULL on error.

   `kind` must be strictly wider than the kind of `s`, otherwise
   SystemError ("invalid widening attempt") is set.  The result is a
   buffer obtained from PyMem_New().
   NOTE(review): the caller presumably releases it with PyMem_Free() --
   confirm at call sites. */

void*
_PyUnicode_AsKind(PyObject *s, unsigned int kind)
{
    Py_ssize_t len;
    void *result;
    unsigned int skind;

    if (PyUnicode_READY(s) == -1)
        return NULL;

    len = PyUnicode_GET_LENGTH(s);
    skind = PyUnicode_KIND(s);
    if (skind >= kind) {
        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
        return NULL;
    }
    switch (kind) {
    case PyUnicode_2BYTE_KIND:
        result = PyMem_New(Py_UCS2, len);
        if (!result)
            return PyErr_NoMemory();
        assert(skind == PyUnicode_1BYTE_KIND);
        _PyUnicode_CONVERT_BYTES(
            Py_UCS1, Py_UCS2,
            PyUnicode_1BYTE_DATA(s),
            PyUnicode_1BYTE_DATA(s) + len,
            result);
        return result;
    case PyUnicode_4BYTE_KIND:
        result = PyMem_New(Py_UCS4, len);
        if (!result)
            return PyErr_NoMemory();
        if (skind == PyUnicode_2BYTE_KIND) {
            _PyUnicode_CONVERT_BYTES(
                Py_UCS2, Py_UCS4,
                PyUnicode_2BYTE_DATA(s),
                PyUnicode_2BYTE_DATA(s) + len,
                result);
        }
        else {
            assert(skind == PyUnicode_1BYTE_KIND);
            _PyUnicode_CONVERT_BYTES(
                Py_UCS1, Py_UCS4,
                PyUnicode_1BYTE_DATA(s),
                PyUnicode_1BYTE_DATA(s) + len,
                result);
        }
        return result;
    default:
        break;
    }
    PyErr_SetString(PyExc_SystemError, "invalid kind");
    return NULL;
}

/* Copy the characters of `string` into a Py_UCS4 buffer.

   - target == NULL: a buffer of len (+1 if copy_null) elements is
     allocated with PyMem_New(); MemoryError is set on failure.
   - target != NULL: targetsize must be at least len (+1 if copy_null),
     otherwise SystemError is set, target[0] is zeroed when copy_null is
     set and targetsize > 0, and NULL is returned.

   A terminating 0 is stored at target[len] when copy_null is non-zero.
   Returns the buffer written to, or NULL on error. */
static Py_UCS4*
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
        int copy_null)
{
    int kind;
    void *data;
    Py_ssize_t len, targetlen;
    if (PyUnicode_READY(string) == -1)
        return NULL;
    kind = PyUnicode_KIND(string);
    data = PyUnicode_DATA(string);
    len = PyUnicode_GET_LENGTH(string);
    targetlen = len;
    if (copy_null)
        targetlen++;
    if (!target) {
        target = PyMem_New(Py_UCS4, targetlen);
        if (!target) {
            PyErr_NoMemory();
            return NULL;
        }
    }
    else {
        if (targetsize < targetlen) {
            PyErr_Format(PyExc_SystemError,
                         "string is longer than the buffer");
            if (copy_null && 0 < targetsize)
                target[0] = 0;
            return NULL;
        }
    }
    if (kind == PyUnicode_1BYTE_KIND) {
        Py_UCS1 *start = (Py_UCS1 *) data;
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
    }
    else if (kind == PyUnicode_2BYTE_KIND) {
        Py_UCS2 *start = (Py_UCS2 *) data;
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
    }
    else {
        assert(kind == PyUnicode_4BYTE_KIND);
        memcpy(target, data, len * sizeof(Py_UCS4));
    }
    if (copy_null)
        target[len] = 0;
    return target;
}

/* Public wrapper around as_ucs4() for a caller-supplied buffer.
   A NULL target or a negative targetsize is rejected with
   PyErr_BadInternalCall() and NULL is returned. */
Py_UCS4*
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
                 int copy_null)
{
    if (target == NULL || targetsize < 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return as_ucs4(string, target, targetsize, copy_null);
}

/* Return a newly allocated, 0-terminated Py_UCS4 copy of `string`
   (as_ucs4() with no target buffer and copy_null set), or NULL on
   error. */
Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject *string)
{
    return as_ucs4(string, NULL, 0, 1);
}

/* maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). 
*/
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)

/* Append `str` to `writer`, honouring an optional width and precision.

   precision == -1 means "no truncation"; otherwise at most `precision`
   characters of `str` are written.  When width exceeds the number of
   characters written, the output is left-padded with spaces.
   Returns 0 on success, -1 on error. */
static int
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
                             Py_ssize_t width, Py_ssize_t precision)
{
    Py_ssize_t length, fill, arglen;
    Py_UCS4 maxchar;

    if (PyUnicode_READY(str) == -1)
        return -1;

    length = PyUnicode_GET_LENGTH(str);
    /* Fast path: neither truncation nor padding is needed. */
    if ((precision == -1 || precision >= length)
        && width <= length)
        return _PyUnicodeWriter_WriteStr(writer, str);

    if (precision != -1)
        length = Py_MIN(precision, length);

    arglen = Py_MAX(length, width);
    /* Only rescan the (possibly truncated) prefix when the string could
       widen the writer's buffer. */
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
    else
        maxchar = writer->maxchar;

    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
        return -1;

    if (width > length) {
        fill = width - length;
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
            return -1;
        writer->pos += fill;
    }

    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                  str, 0, length);
    writer->pos += length;
    return 0;
}

/* Append the C string `str` to `writer`.

   The bytes are decoded as UTF-8 with the "replace" error handler.
   `precision` limits the number of BYTES taken from `str` (it stops
   early at a NUL); -1 means the whole string.  `width` is applied to the
   decoded string.  Returns 0 on success, -1 on error. */
static int
unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
                              Py_ssize_t width, Py_ssize_t precision)
{
    /* UTF-8 */
    Py_ssize_t length;
    PyObject *unicode;
    int res;

    if (precision == -1) {
        length = strlen(str);
    }
    else {
        length = 0;
        while (length < precision && str[length]) {
            length++;
        }
    }
    unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
    if (unicode == NULL)
        return -1;

    /* Precision was already applied at the byte level above. */
    res = unicode_fromformat_write_str(writer, unicode, width, -1);
    Py_DECREF(unicode);
    return res;
}

/* Process one conversion specification of a PyUnicode_FromFormatV()
   format string.

   `f` points at the '%' introducing the specification; the matching
   argument(s) are pulled from *vargs and the formatted text is appended
   to `writer`.  Returns a pointer just past the consumed specification,
   or NULL with an exception set on error.  For an unknown conversion the
   rest of the format string (starting at the '%') is copied verbatim and
   a pointer to its terminating NUL is returned. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* Remember the start of the specification for the fallback path. */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Overflow check before multiplying by 10. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* Last specification of the format: no further output is expected,
       so stop over-allocating. */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        if (*f == 'u') {
            if (longflag)
                len = sprintf(buffer, "%lu",
                        va_arg(*vargs, unsigned long));
            else if (longlongflag)
                len = sprintf(buffer, "%llu",
                        va_arg(*vargs, unsigned long long));
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, "%u",
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            /* NOTE(review): the l/ll/z modifiers are only recognised
               before d, u and i above, so 'x' always reads an int. */
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag)
                len = sprintf(buffer, "%li",
                        va_arg(*vargs, long));
            else if (longlongflag)
                len = sprintf(buffer, "%lli",
                        va_arg(*vargs, long long));
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, "%i",
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* From here on precision is the number of digits to emit. */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* Pad up to the width with spaces, or zeros if the '0' flag was
           given. */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* Pad up to the precision with zeros. */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* Shift the digits (and the NUL) right to make room. */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments are always consumed: the object is used when it
           is non-NULL, the C string otherwise. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    f++;
    return f;
}

/* Build a string from a printf-like `format` and a va_list.

   Literal runs of the format must be ASCII; a byte above 127 raises
   ValueError.  Each '%' specification is handled by
   unicode_fromformat_arg().  Returns the finished string, or NULL with
   an exception set. */
PyObject *
PyUnicode_FromFormatV(const char *format, va_list vargs)
{
    va_list vargs2;
    const char *f;
    _PyUnicodeWriter writer;

    _PyUnicodeWriter_Init(&writer);
    writer.min_length = strlen(format) + 100;
    writer.overallocate = 1;

    // Copy varargs to be able to pass a reference to a subfunction.
    va_copy(vargs2, vargs);

    for (f = format; *f; ) {
        if (*f == '%') {
            f = unicode_fromformat_arg(&writer, f, &vargs2);
            if (f == NULL)
                goto fail;
        }
        else {
            const char *p;
            Py_ssize_t len;

            /* Collect the literal run up to the next '%' or the end. */
            p = f;
            do
            {
                if ((unsigned char)*p > 127) {
                    PyErr_Format(PyExc_ValueError,
                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
                        "string, got a non-ASCII byte: 0x%02x",
                        (unsigned char)*p);
                    goto fail;
                }
                p++;
            }
            while (*p != '\0' && *p != '%');
            len = p - f;

            /* Final chunk: no need to over-allocate any more. */
            if (*p == '\0')
                writer.overallocate = 0;

            if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
                goto fail;

            f = p;
        }
    }
    va_end(vargs2);
    return _PyUnicodeWriter_Finish(&writer);

  fail:
    va_end(vargs2);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}

/* Variadic front end for PyUnicode_FromFormatV(). */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    PyObject* ret;
    va_list vargs;

#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    ret = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);
    return ret;
}

#ifdef HAVE_WCHAR_H

/* Convert a Unicode object to a wide character string.

   - If w is NULL: return the number of wide characters (including the null
     character) required to convert the unicode object. Ignore size argument.

   - Otherwise: return the number of wide characters (excluding the null
     character) written into w. Write at most size wide characters (including
     the null character). 
*/ 2981 Py_ssize_t 2982 PyUnicode_AsWideChar(PyObject *unicode, 2983 wchar_t *w, 2984 Py_ssize_t size) 2985 { 2986 Py_ssize_t res; 2987 const wchar_t *wstr; 2988 2989 if (unicode == NULL) { 2990 PyErr_BadInternalCall(); 2991 return -1; 2992 } 2993 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2994 if (wstr == NULL) 2995 return -1; 2996 2997 if (w != NULL) { 2998 if (size > res) 2999 size = res + 1; 3000 else 3001 res = size; 3002 memcpy(w, wstr, size * sizeof(wchar_t)); 3003 return res; 3004 } 3005 else 3006 return res + 1; 3007 } 3008 3009 wchar_t* 3010 PyUnicode_AsWideCharString(PyObject *unicode, 3011 Py_ssize_t *size) 3012 { 3013 const wchar_t *wstr; 3014 wchar_t *buffer; 3015 Py_ssize_t buflen; 3016 3017 if (unicode == NULL) { 3018 PyErr_BadInternalCall(); 3019 return NULL; 3020 } 3021 3022 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen); 3023 if (wstr == NULL) { 3024 return NULL; 3025 } 3026 if (size == NULL && wcslen(wstr) != (size_t)buflen) { 3027 PyErr_SetString(PyExc_ValueError, 3028 "embedded null character"); 3029 return NULL; 3030 } 3031 3032 buffer = PyMem_NEW(wchar_t, buflen + 1); 3033 if (buffer == NULL) { 3034 PyErr_NoMemory(); 3035 return NULL; 3036 } 3037 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t)); 3038 if (size != NULL) 3039 *size = buflen; 3040 return buffer; 3041 } 3042 3043 #endif /* HAVE_WCHAR_H */ 3044 3045 PyObject * 3046 PyUnicode_FromOrdinal(int ordinal) 3047 { 3048 if (ordinal < 0 || ordinal > MAX_UNICODE) { 3049 PyErr_SetString(PyExc_ValueError, 3050 "chr() arg not in range(0x110000)"); 3051 return NULL; 3052 } 3053 3054 return unicode_char((Py_UCS4)ordinal); 3055 } 3056 3057 PyObject * 3058 PyUnicode_FromObject(PyObject *obj) 3059 { 3060 /* XXX Perhaps we should make this API an alias of 3061 PyObject_Str() instead ?! 
*/ 3062 if (PyUnicode_CheckExact(obj)) { 3063 if (PyUnicode_READY(obj) == -1) 3064 return NULL; 3065 Py_INCREF(obj); 3066 return obj; 3067 } 3068 if (PyUnicode_Check(obj)) { 3069 /* For a Unicode subtype that's not a Unicode object, 3070 return a true Unicode object with the same data. */ 3071 return _PyUnicode_Copy(obj); 3072 } 3073 PyErr_Format(PyExc_TypeError, 3074 "Can't convert '%.100s' object to str implicitly", 3075 Py_TYPE(obj)->tp_name); 3076 return NULL; 3077 } 3078 3079 PyObject * 3080 PyUnicode_FromEncodedObject(PyObject *obj, 3081 const char *encoding, 3082 const char *errors) 3083 { 3084 Py_buffer buffer; 3085 PyObject *v; 3086 3087 if (obj == NULL) { 3088 PyErr_BadInternalCall(); 3089 return NULL; 3090 } 3091 3092 /* Decoding bytes objects is the most common case and should be fast */ 3093 if (PyBytes_Check(obj)) { 3094 if (PyBytes_GET_SIZE(obj) == 0) 3095 _Py_RETURN_UNICODE_EMPTY(); 3096 v = PyUnicode_Decode( 3097 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 3098 encoding, errors); 3099 return v; 3100 } 3101 3102 if (PyUnicode_Check(obj)) { 3103 PyErr_SetString(PyExc_TypeError, 3104 "decoding str is not supported"); 3105 return NULL; 3106 } 3107 3108 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 3109 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 3110 PyErr_Format(PyExc_TypeError, 3111 "decoding to str: need a bytes-like object, %.80s found", 3112 Py_TYPE(obj)->tp_name); 3113 return NULL; 3114 } 3115 3116 if (buffer.len == 0) { 3117 PyBuffer_Release(&buffer); 3118 _Py_RETURN_UNICODE_EMPTY(); 3119 } 3120 3121 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 3122 PyBuffer_Release(&buffer); 3123 return v; 3124 } 3125 3126 /* Normalize an encoding name: similar to encodings.normalize_encoding(), but 3127 also convert to lowercase. Return 1 on success, or 0 on error (encoding is 3128 longer than lower_len-1). 
*/ 3129 int 3130 _Py_normalize_encoding(const char *encoding, 3131 char *lower, 3132 size_t lower_len) 3133 { 3134 const char *e; 3135 char *l; 3136 char *l_end; 3137 int punct; 3138 3139 assert(encoding != NULL); 3140 3141 e = encoding; 3142 l = lower; 3143 l_end = &lower[lower_len - 1]; 3144 punct = 0; 3145 while (1) { 3146 char c = *e; 3147 if (c == 0) { 3148 break; 3149 } 3150 3151 if (Py_ISALNUM(c) || c == '.') { 3152 if (punct && l != lower) { 3153 if (l == l_end) { 3154 return 0; 3155 } 3156 *l++ = '_'; 3157 } 3158 punct = 0; 3159 3160 if (l == l_end) { 3161 return 0; 3162 } 3163 *l++ = Py_TOLOWER(c); 3164 } 3165 else { 3166 punct = 1; 3167 } 3168 3169 e++; 3170 } 3171 *l = '\0'; 3172 return 1; 3173 } 3174 3175 PyObject * 3176 PyUnicode_Decode(const char *s, 3177 Py_ssize_t size, 3178 const char *encoding, 3179 const char *errors) 3180 { 3181 PyObject *buffer = NULL, *unicode; 3182 Py_buffer info; 3183 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */ 3184 3185 if (encoding == NULL) { 3186 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3187 } 3188 3189 /* Shortcuts for common default encodings */ 3190 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { 3191 char *lower = buflower; 3192 3193 /* Fast paths */ 3194 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') { 3195 lower += 3; 3196 if (*lower == '_') { 3197 /* Match "utf8" and "utf_8" */ 3198 lower++; 3199 } 3200 3201 if (lower[0] == '8' && lower[1] == 0) { 3202 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3203 } 3204 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) { 3205 return PyUnicode_DecodeUTF16(s, size, errors, 0); 3206 } 3207 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) { 3208 return PyUnicode_DecodeUTF32(s, size, errors, 0); 3209 } 3210 } 3211 else { 3212 if (strcmp(lower, "ascii") == 0 3213 || strcmp(lower, "us_ascii") == 0) { 3214 return PyUnicode_DecodeASCII(s, size, errors); 3215 } 3216 
#ifdef MS_WINDOWS 3217 else if (strcmp(lower, "mbcs") == 0) { 3218 return PyUnicode_DecodeMBCS(s, size, errors); 3219 } 3220 #endif 3221 else if (strcmp(lower, "latin1") == 0 3222 || strcmp(lower, "latin_1") == 0 3223 || strcmp(lower, "iso_8859_1") == 0 3224 || strcmp(lower, "iso8859_1") == 0) { 3225 return PyUnicode_DecodeLatin1(s, size, errors); 3226 } 3227 } 3228 } 3229 3230 /* Decode via the codec registry */ 3231 buffer = NULL; 3232 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 3233 goto onError; 3234 buffer = PyMemoryView_FromBuffer(&info); 3235 if (buffer == NULL) 3236 goto onError; 3237 unicode = _PyCodec_DecodeText(buffer, encoding, errors); 3238 if (unicode == NULL) 3239 goto onError; 3240 if (!PyUnicode_Check(unicode)) { 3241 PyErr_Format(PyExc_TypeError, 3242 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3243 "use codecs.decode() to decode to arbitrary types", 3244 encoding, 3245 Py_TYPE(unicode)->tp_name); 3246 Py_DECREF(unicode); 3247 goto onError; 3248 } 3249 Py_DECREF(buffer); 3250 return unicode_result(unicode); 3251 3252 onError: 3253 Py_XDECREF(buffer); 3254 return NULL; 3255 } 3256 3257 PyObject * 3258 PyUnicode_AsDecodedObject(PyObject *unicode, 3259 const char *encoding, 3260 const char *errors) 3261 { 3262 if (!PyUnicode_Check(unicode)) { 3263 PyErr_BadArgument(); 3264 return NULL; 3265 } 3266 3267 if (PyErr_WarnEx(PyExc_DeprecationWarning, 3268 "PyUnicode_AsDecodedObject() is deprecated; " 3269 "use PyCodec_Decode() to decode from str", 1) < 0) 3270 return NULL; 3271 3272 if (encoding == NULL) 3273 encoding = PyUnicode_GetDefaultEncoding(); 3274 3275 /* Decode via the codec registry */ 3276 return PyCodec_Decode(unicode, encoding, errors); 3277 } 3278 3279 PyObject * 3280 PyUnicode_AsDecodedUnicode(PyObject *unicode, 3281 const char *encoding, 3282 const char *errors) 3283 { 3284 PyObject *v; 3285 3286 if (!PyUnicode_Check(unicode)) { 3287 PyErr_BadArgument(); 3288 goto onError; 3289 } 3290 3291 if 
(PyErr_WarnEx(PyExc_DeprecationWarning, 3292 "PyUnicode_AsDecodedUnicode() is deprecated; " 3293 "use PyCodec_Decode() to decode from str to str", 1) < 0) 3294 return NULL; 3295 3296 if (encoding == NULL) 3297 encoding = PyUnicode_GetDefaultEncoding(); 3298 3299 /* Decode via the codec registry */ 3300 v = PyCodec_Decode(unicode, encoding, errors); 3301 if (v == NULL) 3302 goto onError; 3303 if (!PyUnicode_Check(v)) { 3304 PyErr_Format(PyExc_TypeError, 3305 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3306 "use codecs.decode() to decode to arbitrary types", 3307 encoding, 3308 Py_TYPE(unicode)->tp_name); 3309 Py_DECREF(v); 3310 goto onError; 3311 } 3312 return unicode_result(v); 3313 3314 onError: 3315 return NULL; 3316 } 3317 3318 PyObject * 3319 PyUnicode_Encode(const Py_UNICODE *s, 3320 Py_ssize_t size, 3321 const char *encoding, 3322 const char *errors) 3323 { 3324 PyObject *v, *unicode; 3325 3326 unicode = PyUnicode_FromWideChar(s, size); 3327 if (unicode == NULL) 3328 return NULL; 3329 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 3330 Py_DECREF(unicode); 3331 return v; 3332 } 3333 3334 PyObject * 3335 PyUnicode_AsEncodedObject(PyObject *unicode, 3336 const char *encoding, 3337 const char *errors) 3338 { 3339 PyObject *v; 3340 3341 if (!PyUnicode_Check(unicode)) { 3342 PyErr_BadArgument(); 3343 goto onError; 3344 } 3345 3346 if (PyErr_WarnEx(PyExc_DeprecationWarning, 3347 "PyUnicode_AsEncodedObject() is deprecated; " 3348 "use PyUnicode_AsEncodedString() to encode from str to bytes " 3349 "or PyCodec_Encode() for generic encoding", 1) < 0) 3350 return NULL; 3351 3352 if (encoding == NULL) 3353 encoding = PyUnicode_GetDefaultEncoding(); 3354 3355 /* Encode via the codec registry */ 3356 v = PyCodec_Encode(unicode, encoding, errors); 3357 if (v == NULL) 3358 goto onError; 3359 return v; 3360 3361 onError: 3362 return NULL; 3363 } 3364 3365 static int 3366 locale_error_handler(const char *errors, int *surrogateescape) 3367 { 3368 
_Py_error_handler error_handler = get_error_handler(errors); 3369 switch (error_handler) 3370 { 3371 case _Py_ERROR_STRICT: 3372 *surrogateescape = 0; 3373 return 0; 3374 case _Py_ERROR_SURROGATEESCAPE: 3375 *surrogateescape = 1; 3376 return 0; 3377 default: 3378 PyErr_Format(PyExc_ValueError, 3379 "only 'strict' and 'surrogateescape' error handlers " 3380 "are supported, not '%s'", 3381 errors); 3382 return -1; 3383 } 3384 } 3385 3386 static PyObject * 3387 unicode_encode_locale(PyObject *unicode, const char *errors, 3388 int current_locale) 3389 { 3390 int surrogateescape; 3391 if (locale_error_handler(errors, &surrogateescape) < 0) 3392 return NULL; 3393 3394 Py_ssize_t wlen; 3395 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen); 3396 if (wstr == NULL) { 3397 return NULL; 3398 } 3399 3400 if ((size_t)wlen != wcslen(wstr)) { 3401 PyErr_SetString(PyExc_ValueError, "embedded null character"); 3402 PyMem_Free(wstr); 3403 return NULL; 3404 } 3405 3406 char *str; 3407 size_t error_pos; 3408 const char *reason; 3409 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason, 3410 current_locale, surrogateescape); 3411 PyMem_Free(wstr); 3412 3413 if (res != 0) { 3414 if (res == -2) { 3415 PyObject *exc; 3416 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns", 3417 "locale", unicode, 3418 (Py_ssize_t)error_pos, 3419 (Py_ssize_t)(error_pos+1), 3420 reason); 3421 if (exc != NULL) { 3422 PyCodec_StrictErrors(exc); 3423 Py_DECREF(exc); 3424 } 3425 } 3426 else { 3427 PyErr_NoMemory(); 3428 } 3429 return NULL; 3430 } 3431 3432 PyObject *bytes = PyBytes_FromString(str); 3433 PyMem_RawFree(str); 3434 return bytes; 3435 } 3436 3437 PyObject * 3438 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) 3439 { 3440 return unicode_encode_locale(unicode, errors, 1); 3441 } 3442 3443 PyObject * 3444 PyUnicode_EncodeFSDefault(PyObject *unicode) 3445 { 3446 #if defined(__APPLE__) 3447 return _PyUnicode_AsUTF8String(unicode, 
Py_FileSystemDefaultEncodeErrors); 3448 #else 3449 PyInterpreterState *interp = PyThreadState_GET()->interp; 3450 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3451 cannot use it to encode and decode filenames before it is loaded. Load 3452 the Python codec requires to encode at least its own filename. Use the C 3453 version of the locale codec until the codec registry is initialized and 3454 the Python codec is loaded. 3455 3456 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3457 cannot only rely on it: check also interp->fscodec_initialized for 3458 subinterpreters. */ 3459 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3460 return PyUnicode_AsEncodedString(unicode, 3461 Py_FileSystemDefaultEncoding, 3462 Py_FileSystemDefaultEncodeErrors); 3463 } 3464 else { 3465 return unicode_encode_locale(unicode, 3466 Py_FileSystemDefaultEncodeErrors, 0); 3467 } 3468 #endif 3469 } 3470 3471 PyObject * 3472 PyUnicode_AsEncodedString(PyObject *unicode, 3473 const char *encoding, 3474 const char *errors) 3475 { 3476 PyObject *v; 3477 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */ 3478 3479 if (!PyUnicode_Check(unicode)) { 3480 PyErr_BadArgument(); 3481 return NULL; 3482 } 3483 3484 if (encoding == NULL) { 3485 return _PyUnicode_AsUTF8String(unicode, errors); 3486 } 3487 3488 /* Shortcuts for common default encodings */ 3489 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { 3490 char *lower = buflower; 3491 3492 /* Fast paths */ 3493 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') { 3494 lower += 3; 3495 if (*lower == '_') { 3496 /* Match "utf8" and "utf_8" */ 3497 lower++; 3498 } 3499 3500 if (lower[0] == '8' && lower[1] == 0) { 3501 return _PyUnicode_AsUTF8String(unicode, errors); 3502 } 3503 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) { 3504 return _PyUnicode_EncodeUTF16(unicode, errors, 0); 3505 } 3506 else if (lower[0] == '3' && lower[1] 
== '2' && lower[2] == 0) { 3507 return _PyUnicode_EncodeUTF32(unicode, errors, 0); 3508 } 3509 } 3510 else { 3511 if (strcmp(lower, "ascii") == 0 3512 || strcmp(lower, "us_ascii") == 0) { 3513 return _PyUnicode_AsASCIIString(unicode, errors); 3514 } 3515 #ifdef MS_WINDOWS 3516 else if (strcmp(lower, "mbcs") == 0) { 3517 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3518 } 3519 #endif 3520 else if (strcmp(lower, "latin1") == 0 || 3521 strcmp(lower, "latin_1") == 0 || 3522 strcmp(lower, "iso_8859_1") == 0 || 3523 strcmp(lower, "iso8859_1") == 0) { 3524 return _PyUnicode_AsLatin1String(unicode, errors); 3525 } 3526 } 3527 } 3528 3529 /* Encode via the codec registry */ 3530 v = _PyCodec_EncodeText(unicode, encoding, errors); 3531 if (v == NULL) 3532 return NULL; 3533 3534 /* The normal path */ 3535 if (PyBytes_Check(v)) 3536 return v; 3537 3538 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3539 if (PyByteArray_Check(v)) { 3540 int error; 3541 PyObject *b; 3542 3543 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3544 "encoder %s returned bytearray instead of bytes; " 3545 "use codecs.encode() to encode to arbitrary types", 3546 encoding); 3547 if (error) { 3548 Py_DECREF(v); 3549 return NULL; 3550 } 3551 3552 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), 3553 PyByteArray_GET_SIZE(v)); 3554 Py_DECREF(v); 3555 return b; 3556 } 3557 3558 PyErr_Format(PyExc_TypeError, 3559 "'%.400s' encoder returned '%.400s' instead of 'bytes'; " 3560 "use codecs.encode() to encode to arbitrary types", 3561 encoding, 3562 Py_TYPE(v)->tp_name); 3563 Py_DECREF(v); 3564 return NULL; 3565 } 3566 3567 PyObject * 3568 PyUnicode_AsEncodedUnicode(PyObject *unicode, 3569 const char *encoding, 3570 const char *errors) 3571 { 3572 PyObject *v; 3573 3574 if (!PyUnicode_Check(unicode)) { 3575 PyErr_BadArgument(); 3576 goto onError; 3577 } 3578 3579 if (PyErr_WarnEx(PyExc_DeprecationWarning, 3580 "PyUnicode_AsEncodedUnicode() is deprecated; " 3581 
"use PyCodec_Encode() to encode from str to str", 1) < 0) 3582 return NULL; 3583 3584 if (encoding == NULL) 3585 encoding = PyUnicode_GetDefaultEncoding(); 3586 3587 /* Encode via the codec registry */ 3588 v = PyCodec_Encode(unicode, encoding, errors); 3589 if (v == NULL) 3590 goto onError; 3591 if (!PyUnicode_Check(v)) { 3592 PyErr_Format(PyExc_TypeError, 3593 "'%.400s' encoder returned '%.400s' instead of 'str'; " 3594 "use codecs.encode() to encode to arbitrary types", 3595 encoding, 3596 Py_TYPE(v)->tp_name); 3597 Py_DECREF(v); 3598 goto onError; 3599 } 3600 return v; 3601 3602 onError: 3603 return NULL; 3604 } 3605 3606 static PyObject* 3607 unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, 3608 int current_locale) 3609 { 3610 int surrogateescape; 3611 if (locale_error_handler(errors, &surrogateescape) < 0) 3612 return NULL; 3613 3614 if (str[len] != '\0' || (size_t)len != strlen(str)) { 3615 PyErr_SetString(PyExc_ValueError, "embedded null byte"); 3616 return NULL; 3617 } 3618 3619 wchar_t *wstr; 3620 size_t wlen; 3621 const char *reason; 3622 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason, 3623 current_locale, surrogateescape); 3624 if (res != 0) { 3625 if (res == -2) { 3626 PyObject *exc; 3627 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns", 3628 "locale", str, len, 3629 (Py_ssize_t)wlen, 3630 (Py_ssize_t)(wlen + 1), 3631 reason); 3632 if (exc != NULL) { 3633 PyCodec_StrictErrors(exc); 3634 Py_DECREF(exc); 3635 } 3636 } 3637 else { 3638 PyErr_NoMemory(); 3639 } 3640 return NULL; 3641 } 3642 3643 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen); 3644 PyMem_RawFree(wstr); 3645 return unicode; 3646 } 3647 3648 PyObject* 3649 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, 3650 const char *errors) 3651 { 3652 return unicode_decode_locale(str, len, errors, 1); 3653 } 3654 3655 PyObject* 3656 PyUnicode_DecodeLocale(const char *str, const char *errors) 3657 { 3658 Py_ssize_t size = 
(Py_ssize_t)strlen(str); 3659 return unicode_decode_locale(str, size, errors, 1); 3660 } 3661 3662 3663 PyObject* 3664 PyUnicode_DecodeFSDefault(const char *s) { 3665 Py_ssize_t size = (Py_ssize_t)strlen(s); 3666 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3667 } 3668 3669 PyObject* 3670 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3671 { 3672 #if defined(__APPLE__) 3673 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL); 3674 #else 3675 PyInterpreterState *interp = PyThreadState_GET()->interp; 3676 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3677 cannot use it to encode and decode filenames before it is loaded. Load 3678 the Python codec requires to encode at least its own filename. Use the C 3679 version of the locale codec until the codec registry is initialized and 3680 the Python codec is loaded. 3681 3682 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3683 cannot only rely on it: check also interp->fscodec_initialized for 3684 subinterpreters. */ 3685 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3686 return PyUnicode_Decode(s, size, 3687 Py_FileSystemDefaultEncoding, 3688 Py_FileSystemDefaultEncodeErrors); 3689 } 3690 else { 3691 return unicode_decode_locale(s, size, 3692 Py_FileSystemDefaultEncodeErrors, 0); 3693 } 3694 #endif 3695 } 3696 3697 3698 int 3699 PyUnicode_FSConverter(PyObject* arg, void* addr) 3700 { 3701 PyObject *path = NULL; 3702 PyObject *output = NULL; 3703 Py_ssize_t size; 3704 void *data; 3705 if (arg == NULL) { 3706 Py_DECREF(*(PyObject**)addr); 3707 *(PyObject**)addr = NULL; 3708 return 1; 3709 } 3710 path = PyOS_FSPath(arg); 3711 if (path == NULL) { 3712 return 0; 3713 } 3714 if (PyBytes_Check(path)) { 3715 output = path; 3716 } 3717 else { // PyOS_FSPath() guarantees its returned value is bytes or str. 
3718 output = PyUnicode_EncodeFSDefault(path); 3719 Py_DECREF(path); 3720 if (!output) { 3721 return 0; 3722 } 3723 assert(PyBytes_Check(output)); 3724 } 3725 3726 size = PyBytes_GET_SIZE(output); 3727 data = PyBytes_AS_STRING(output); 3728 if ((size_t)size != strlen(data)) { 3729 PyErr_SetString(PyExc_ValueError, "embedded null byte"); 3730 Py_DECREF(output); 3731 return 0; 3732 } 3733 *(PyObject**)addr = output; 3734 return Py_CLEANUP_SUPPORTED; 3735 } 3736 3737 3738 int 3739 PyUnicode_FSDecoder(PyObject* arg, void* addr) 3740 { 3741 int is_buffer = 0; 3742 PyObject *path = NULL; 3743 PyObject *output = NULL; 3744 if (arg == NULL) { 3745 Py_DECREF(*(PyObject**)addr); 3746 *(PyObject**)addr = NULL; 3747 return 1; 3748 } 3749 3750 is_buffer = PyObject_CheckBuffer(arg); 3751 if (!is_buffer) { 3752 path = PyOS_FSPath(arg); 3753 if (path == NULL) { 3754 return 0; 3755 } 3756 } 3757 else { 3758 path = arg; 3759 Py_INCREF(arg); 3760 } 3761 3762 if (PyUnicode_Check(path)) { 3763 if (PyUnicode_READY(path) == -1) { 3764 Py_DECREF(path); 3765 return 0; 3766 } 3767 output = path; 3768 } 3769 else if (PyBytes_Check(path) || is_buffer) { 3770 PyObject *path_bytes = NULL; 3771 3772 if (!PyBytes_Check(path) && 3773 PyErr_WarnFormat(PyExc_DeprecationWarning, 1, 3774 "path should be string, bytes, or os.PathLike, not %.200s", 3775 Py_TYPE(arg)->tp_name)) { 3776 Py_DECREF(path); 3777 return 0; 3778 } 3779 path_bytes = PyBytes_FromObject(path); 3780 Py_DECREF(path); 3781 if (!path_bytes) { 3782 return 0; 3783 } 3784 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes), 3785 PyBytes_GET_SIZE(path_bytes)); 3786 Py_DECREF(path_bytes); 3787 if (!output) { 3788 return 0; 3789 } 3790 } 3791 else { 3792 PyErr_Format(PyExc_TypeError, 3793 "path should be string, bytes, or os.PathLike, not %.200s", 3794 Py_TYPE(arg)->tp_name); 3795 Py_DECREF(path); 3796 return 0; 3797 } 3798 if (PyUnicode_READY(output) == -1) { 3799 Py_DECREF(output); 3800 return 0; 3801 } 3802 if 
(findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3803 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3804 PyErr_SetString(PyExc_ValueError, "embedded null character"); 3805 Py_DECREF(output); 3806 return 0; 3807 } 3808 *(PyObject**)addr = output; 3809 return Py_CLEANUP_SUPPORTED; 3810 } 3811 3812 3813 const char * 3814 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3815 { 3816 PyObject *bytes; 3817 3818 if (!PyUnicode_Check(unicode)) { 3819 PyErr_BadArgument(); 3820 return NULL; 3821 } 3822 if (PyUnicode_READY(unicode) == -1) 3823 return NULL; 3824 3825 if (PyUnicode_UTF8(unicode) == NULL) { 3826 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3827 bytes = _PyUnicode_AsUTF8String(unicode, NULL); 3828 if (bytes == NULL) 3829 return NULL; 3830 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3831 if (_PyUnicode_UTF8(unicode) == NULL) { 3832 PyErr_NoMemory(); 3833 Py_DECREF(bytes); 3834 return NULL; 3835 } 3836 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3837 memcpy(_PyUnicode_UTF8(unicode), 3838 PyBytes_AS_STRING(bytes), 3839 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3840 Py_DECREF(bytes); 3841 } 3842 3843 if (psize) 3844 *psize = PyUnicode_UTF8_LENGTH(unicode); 3845 return PyUnicode_UTF8(unicode); 3846 } 3847 3848 const char * 3849 PyUnicode_AsUTF8(PyObject *unicode) 3850 { 3851 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3852 } 3853 3854 Py_UNICODE * 3855 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3856 { 3857 const unsigned char *one_byte; 3858 #if SIZEOF_WCHAR_T == 4 3859 const Py_UCS2 *two_bytes; 3860 #else 3861 const Py_UCS4 *four_bytes; 3862 const Py_UCS4 *ucs4_end; 3863 Py_ssize_t num_surrogates; 3864 #endif 3865 wchar_t *w; 3866 wchar_t *wchar_end; 3867 3868 if (!PyUnicode_Check(unicode)) { 3869 PyErr_BadArgument(); 3870 return NULL; 3871 } 3872 if (_PyUnicode_WSTR(unicode) == NULL) { 3873 /* Non-ASCII compact unicode object */ 3874 assert(_PyUnicode_KIND(unicode) != 0); 3875 
/* Return the wchar_t (Py_UNICODE) representation of a str.

   unicode -- must be a str (otherwise a bad-argument error is reported)
   size    -- if not NULL, receives PyUnicode_WSTR_LENGTH(unicode)

   If the object has no wstr buffer yet, one is allocated with
   PyObject_MALLOC(), filled from the object's character data, terminated
   with a 0 and stored on the object; later calls return the stored buffer.
   The returned pointer is that stored buffer.

   Which conversions are compiled in depends on SIZEOF_WCHAR_T:
     - 2-byte wchar_t: 4-byte data is converted to UTF-16, code points
       above 0xFFFF becoming surrogate pairs; reaching the 2-byte branch
       is treated as a fatal error;
     - 4-byte wchar_t: 2-byte data is widened; reaching the 4-byte branch
       is treated as a fatal error.
   1-byte data is widened element by element in both configurations.

   Returns NULL with an exception set on failure. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* First pass: count the code points that need two wchar_t
               units so the buffer can be sized exactly. */
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* One unit per code point, one extra per surrogate pair,
               plus one for the terminator. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: rewind to the start of the data and convert. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* Sanity check: the write position must stay within
                   the size computed by the first pass. */
                if (w > wchar_end) {
                    Py_UNREACHABLE();
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* Guard the size computation below against overflow. */
            if ((size_t)_PyUnicode_LENGTH(unicode) >
                    PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* NOTE(review): the wstr length is not stored for compact
               ASCII objects -- presumably they have no separate field
               for it; confirm against the object layout. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Release the buffer allocated above before aborting. */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                Py_UNREACHABLE();
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
(PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 3939 one_byte = PyUnicode_1BYTE_DATA(unicode); 3940 for (; w < wchar_end; ++one_byte, ++w) 3941 *w = *one_byte; 3942 /* null-terminate the wstr */ 3943 *w = 0; 3944 } 3945 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 3946 #if SIZEOF_WCHAR_T == 4 3947 two_bytes = PyUnicode_2BYTE_DATA(unicode); 3948 for (; w < wchar_end; ++two_bytes, ++w) 3949 *w = *two_bytes; 3950 /* null-terminate the wstr */ 3951 *w = 0; 3952 #else 3953 /* sizeof(wchar_t) == 2 */ 3954 PyObject_FREE(_PyUnicode_WSTR(unicode)); 3955 _PyUnicode_WSTR(unicode) = NULL; 3956 Py_FatalError("Impossible unicode object state, wstr " 3957 "and str should share memory already."); 3958 return NULL; 3959 #endif 3960 } 3961 else { 3962 Py_UNREACHABLE(); 3963 } 3964 } 3965 } 3966 if (size != NULL) 3967 *size = PyUnicode_WSTR_LENGTH(unicode); 3968 return _PyUnicode_WSTR(unicode); 3969 } 3970 3971 Py_UNICODE * 3972 PyUnicode_AsUnicode(PyObject *unicode) 3973 { 3974 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3975 } 3976 3977 const Py_UNICODE * 3978 _PyUnicode_AsUnicode(PyObject *unicode) 3979 { 3980 Py_ssize_t size; 3981 const Py_UNICODE *wstr; 3982 3983 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size); 3984 if (wstr && wcslen(wstr) != (size_t)size) { 3985 PyErr_SetString(PyExc_ValueError, "embedded null character"); 3986 return NULL; 3987 } 3988 return wstr; 3989 } 3990 3991 3992 Py_ssize_t 3993 PyUnicode_GetSize(PyObject *unicode) 3994 { 3995 if (!PyUnicode_Check(unicode)) { 3996 PyErr_BadArgument(); 3997 goto onError; 3998 } 3999 if (_PyUnicode_WSTR(unicode) == NULL) { 4000 if (PyUnicode_AsUnicode(unicode) == NULL) 4001 goto onError; 4002 } 4003 return PyUnicode_WSTR_LENGTH(unicode); 4004 4005 onError: 4006 return -1; 4007 } 4008 4009 Py_ssize_t 4010 PyUnicode_GetLength(PyObject *unicode) 4011 { 4012 if (!PyUnicode_Check(unicode)) { 4013 PyErr_BadArgument(); 4014 return -1; 4015 } 4016 if (PyUnicode_READY(unicode) == -1) 4017 return -1; 4018 
return PyUnicode_GET_LENGTH(unicode); 4019 } 4020 4021 Py_UCS4 4022 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 4023 { 4024 void *data; 4025 int kind; 4026 4027 if (!PyUnicode_Check(unicode)) { 4028 PyErr_BadArgument(); 4029 return (Py_UCS4)-1; 4030 } 4031 if (PyUnicode_READY(unicode) == -1) { 4032 return (Py_UCS4)-1; 4033 } 4034 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 4035 PyErr_SetString(PyExc_IndexError, "string index out of range"); 4036 return (Py_UCS4)-1; 4037 } 4038 data = PyUnicode_DATA(unicode); 4039 kind = PyUnicode_KIND(unicode); 4040 return PyUnicode_READ(kind, data, index); 4041 } 4042 4043 int 4044 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 4045 { 4046 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 4047 PyErr_BadArgument(); 4048 return -1; 4049 } 4050 assert(PyUnicode_IS_READY(unicode)); 4051 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 4052 PyErr_SetString(PyExc_IndexError, "string index out of range"); 4053 return -1; 4054 } 4055 if (unicode_check_modifiable(unicode)) 4056 return -1; 4057 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 4058 PyErr_SetString(PyExc_ValueError, "character out of range"); 4059 return -1; 4060 } 4061 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 4062 index, ch); 4063 return 0; 4064 } 4065 4066 const char * 4067 PyUnicode_GetDefaultEncoding(void) 4068 { 4069 return "utf-8"; 4070 } 4071 4072 /* create or adjust a UnicodeDecodeError */ 4073 static void 4074 make_decode_exception(PyObject **exceptionObject, 4075 const char *encoding, 4076 const char *input, Py_ssize_t length, 4077 Py_ssize_t startpos, Py_ssize_t endpos, 4078 const char *reason) 4079 { 4080 if (*exceptionObject == NULL) { 4081 *exceptionObject = PyUnicodeDecodeError_Create( 4082 encoding, input, length, startpos, endpos, reason); 4083 } 4084 else { 4085 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 4086 goto onError; 4087 if 
#ifdef MS_WINDOWS
/* Error handling callback helper for decoders writing into a wchar_t based
   (PyUnicode_WCHAR_KIND) output string.

   Builds or updates the UnicodeDecodeError, calls the error handler,
   validates its (str, int) result, copies the replacement string to the
   output and updates the decoder's state:

     errors, errorHandler -- handler name and cached handler object; the
                             handler is looked up on first use
     encoding, reason     -- used to build the exception
     input, inend         -- current input window; re-read from the
                             exception afterwards because the handler
                             may have replaced the input object
     startinpos, endinpos -- error range; *endinpos is set to the resume
                             position chosen by the handler
     exceptionObject      -- cached exception, created on first use
     inptr                -- set to *input + resume position
     output, outpos       -- output string (resized if needed) and write
                             position, advanced past the replacement

   A negative resume position is taken relative to the end of the input;
   a position outside [0, input size] raises IndexError.

   Returns 0 on success, -1 on error. */
static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The text after "Un;" doubles as the TypeError message below. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen wide characters.  wcsncpy() must not be used
       here: it stops at the first NUL and zero-fills the remainder, which
       would corrupt a replacement string containing U+0000. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common cleanup */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
*/ 4198 Py_DECREF(restuple); 4199 return 0; 4200 4201 overflow: 4202 PyErr_SetString(PyExc_OverflowError, 4203 "decoded result is too long for a Python string"); 4204 4205 onError: 4206 Py_XDECREF(restuple); 4207 return -1; 4208 } 4209 #endif /* MS_WINDOWS */ 4210 4211 static int 4212 unicode_decode_call_errorhandler_writer( 4213 const char *errors, PyObject **errorHandler, 4214 const char *encoding, const char *reason, 4215 const char **input, const char **inend, Py_ssize_t *startinpos, 4216 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4217 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */) 4218 { 4219 static const char *argparse = "Un;decoding error handler must return (str, int) tuple"; 4220 4221 PyObject *restuple = NULL; 4222 PyObject *repunicode = NULL; 4223 Py_ssize_t insize; 4224 Py_ssize_t newpos; 4225 Py_ssize_t replen; 4226 Py_ssize_t remain; 4227 PyObject *inputobj = NULL; 4228 int need_to_grow = 0; 4229 const char *new_inptr; 4230 4231 if (*errorHandler == NULL) { 4232 *errorHandler = PyCodec_LookupError(errors); 4233 if (*errorHandler == NULL) 4234 goto onError; 4235 } 4236 4237 make_decode_exception(exceptionObject, 4238 encoding, 4239 *input, *inend - *input, 4240 *startinpos, *endinpos, 4241 reason); 4242 if (*exceptionObject == NULL) 4243 goto onError; 4244 4245 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4246 if (restuple == NULL) 4247 goto onError; 4248 if (!PyTuple_Check(restuple)) { 4249 PyErr_SetString(PyExc_TypeError, &argparse[3]); 4250 goto onError; 4251 } 4252 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos)) 4253 goto onError; 4254 4255 /* Copy back the bytes variables, which might have been modified by the 4256 callback */ 4257 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4258 if (!inputobj) 4259 goto onError; 4260 remain = *inend - *input - *endinpos; 4261 *input = PyBytes_AS_STRING(inputobj); 4262 insize = 
PyBytes_GET_SIZE(inputobj); 4263 *inend = *input + insize; 4264 /* we can DECREF safely, as the exception has another reference, 4265 so the object won't go away. */ 4266 Py_DECREF(inputobj); 4267 4268 if (newpos<0) 4269 newpos = insize+newpos; 4270 if (newpos<0 || newpos>insize) { 4271 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4272 goto onError; 4273 } 4274 4275 replen = PyUnicode_GET_LENGTH(repunicode); 4276 if (replen > 1) { 4277 writer->min_length += replen - 1; 4278 need_to_grow = 1; 4279 } 4280 new_inptr = *input + newpos; 4281 if (*inend - new_inptr > remain) { 4282 /* We don't know the decoding algorithm here so we make the worst 4283 assumption that one byte decodes to one unicode character. 4284 If unfortunately one byte could decode to more unicode characters, 4285 the decoder may write out-of-bound then. Is it possible for the 4286 algorithms using this function? */ 4287 writer->min_length += *inend - new_inptr - remain; 4288 need_to_grow = 1; 4289 } 4290 if (need_to_grow) { 4291 writer->overallocate = 1; 4292 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos, 4293 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1) 4294 goto onError; 4295 } 4296 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1) 4297 goto onError; 4298 4299 *endinpos = newpos; 4300 *inptr = new_inptr; 4301 4302 /* we made it! */ 4303 Py_DECREF(restuple); 4304 return 0; 4305 4306 onError: 4307 Py_XDECREF(restuple); 4308 return -1; 4309 } 4310 4311 /* --- UTF-7 Codec -------------------------------------------------------- */ 4312 4313 /* See RFC2152 for details. We encode conservatively and decode liberally. */ 4314 4315 /* Three simple macros defining base-64. */ 4316 4317 /* Is c a base-64 character? 
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Note that the macros below
   evaluate their character argument more than once, so it must not have
   side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   (A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, '/' -> 63) */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character to test; only 1..127 can ever be direct
 * directO  -- non-zero to emit Set O characters as themselves
 * directWS -- non-zero to emit whitespace as itself
 *
 * Every argument is parenthesized in the expansion so that expression
 * arguments (e.g. `a || b`) keep their meaning.  The range test on c
 * comes first, so the table is never indexed out of bounds. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
*/ 4369 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 4370 /* @ A B C D E F G H I J K L M N O */ 4371 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4372 /* P Q R S T U V W X Y Z [ \ ] ^ _ */ 4373 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 4374 /* ` a b c d e f g h i j k l m n o */ 4375 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4376 /* p q r s t u v w x y z { | } ~ del */ 4377 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 4378 }; 4379 4380 /* ENCODE_DIRECT: this character should be encoded as itself. The 4381 * answer depends on whether we are encoding set O as itself, and also 4382 * on whether we are encoding whitespace as itself. RFC2152 makes it 4383 * clear that the answers to these questions vary between 4384 * applications, so this code needs to be flexible. */ 4385 4386 #define ENCODE_DIRECT(c, directO, directWS) \ 4387 ((c) < 128 && (c) > 0 && \ 4388 ((utf7_category[(c)] == 0) || \ 4389 (directWS && (utf7_category[(c)] == 2)) || \ 4390 (directO && (utf7_category[(c)] == 1)))) 4391 4392 PyObject * 4393 PyUnicode_DecodeUTF7(const char *s, 4394 Py_ssize_t size, 4395 const char *errors) 4396 { 4397 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 4398 } 4399 4400 /* The decoder. The only state we preserve is our read position, 4401 * i.e. how many characters we have consumed. So if we end in the 4402 * middle of a shift sequence we have to back off the read position 4403 * and the output to the beginning of the sequence, otherwise we lose 4404 * all the shift state (seen bits, number of bits seen, high 4405 * surrogate). 
*/

/* Decode `size` bytes of UTF-7 starting at `s`.

   Returns a new str object, or NULL on failure (the error paths all go
   through `onError`, which releases the writer).

   errors:   error-handler name, forwarded unchanged to
             unicode_decode_call_errorhandler_writer().
   consumed: if NULL, a shift sequence still open at the end of the input is
             reported through the error handler as "unterminated shift
             sequence".  If non-NULL, no such error is raised; *consumed
             receives the number of input bytes accounted for, and an
             unfinished shift sequence is backed out so that both the input
             position and the output are rewound to its leading '+'. */
PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;          /* input offset of the current '+' / bad byte */
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                /* non-zero while inside a base-64 section */
    Py_ssize_t shiftOutStart;       /* writer.pos when the current shift began */
    unsigned int base64bits = 0;    /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;
    Py_UCS4 surrogate = 0;          /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* `restart` is also entered from the end-of-input handling below
           when the error handler moved `s` back inside the input. */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* lone high surrogate: emit it as-is, then
                               handle outCh normally below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Shared error exit for the loop body: the range reported to the
           handler is [startinpos, current position). */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Output was produced inside the unfinished shift sequence
                   and the writer holds non-ASCII data: build the result from
                   the first shiftOutStart characters only.
                   NOTE(review): presumably done so the result's maximum
                   character is recomputed rather than inherited from the
                   discarded tail -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}


/* Encode the str object `str` as UTF-7 and return a new bytes object, or
   NULL on failure.

   base64SetO:       if non-zero, "Set O" characters are base-64 encoded
                     instead of being written directly.
   base64WhiteSpace: if non-zero, whitespace characters are base-64 encoded
                     instead of being written directly.
   errors:           not referenced by this function.

   The output buffer is allocated at 8 bytes per input character up front
   and shrunk to the bytes actually written at the end. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    int kind;
    void *data;
    Py_ssize_t len;
    PyObject *v;
    int inShift = 0;                /* non-zero while inside a base-64 section */
    Py_ssize_t i;
    unsigned int base64bits = 0;    /* pending bits not yet written out */
    unsigned long base64buffer = 0;
    char * out;
    char * start;

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* It might be possible to tighten this worst case */
    if (len > PY_SSIZE_T_MAX / 8)
        return PyErr_NoMemory();
    v = PyBytes_FromStringAndSize(NULL, len * 8);
    if (v == NULL)
        return NULL;

    start = out = PyBytes_AS_STRING(v);
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                /* a literal '+' is written as "+-" */
                *out++ = '+';
                *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
encode_char:
        /* Append ch to the bit buffer as one 16-bit unit (two for
           ch >= 0x10000) and flush every complete 6-bit group. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* flush the final partial group, padded with zero bits */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    if (inShift)
        *out++ = '-';
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
}
/* Py_UNICODE-buffer variant: wraps `s` in a temporary str object via
   PyUnicode_FromWideChar() and forwards to _PyUnicode_EncodeUTF7(). */
PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                     Py_ssize_t size,
                     int base64SetO,
                     int base64WhiteSpace,
                     const char *errors)
{
    PyObject *result;
    PyObject *tmp = PyUnicode_FromWideChar(s, size);
    if (tmp == NULL)
        return NULL;
    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
                                   base64WhiteSpace, errors);
    Py_DECREF(tmp);
    return result;
}

#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT

/* --- UTF-8 Codec -------------------------------------------------------- */

/* One-shot UTF-8 decode: forwards to PyUnicode_DecodeUTF8Stateful() with
   consumed == NULL. */
PyObject *
PyUnicode_DecodeUTF8(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}

/* stringlib/codecs.h is included once per character width; the functions
   used below carry the corresponding asciilib_/ucs1lib_/ucs2lib_/ucs4lib_
   prefix. */
#include "stringlib/asciilib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs1lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs2lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs4lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif

/* Copy the leading run of ASCII bytes (high bit clear) from [start, end)
   into dest and return its length.  Copying stops at the first byte with
   bit 0x80 set, or at end.  The caller must provide room in dest for up to
   end - start bytes. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        /* scan and copy one long at a time while every byte is ASCII */
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* finish byte by byte */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* Fallback: only scan here; the bytes are copied with a single
       memcpy() once the end of the ASCII run is known. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}

/* Decode `size` bytes of UTF-8 starting at `s`.

   Returns a new str object, or NULL on failure.

   errors:   error-handler name.  It is resolved with get_error_handler() on
             the first decoding error; "ignore", "replace" and
             "surrogateescape" are handled inline, anything else goes through
             unicode_decode_call_errorhandler_writer().
   consumed: if non-NULL, an incomplete sequence at the end of the input is
             not an error; decoding stops in front of it and *consumed
             receives the number of bytes decoded. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    _PyUnicodeWriter writer;
    const char *starts = s;
    const char *end = s + size;

    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    /* copy the leading ASCII run straight into the writer buffer */
    writer.pos = ascii_decode(s, end, writer.data);
    s += writer.pos;
    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;

        /* Use the decoder matching the writer's current character width.
           The return value is either a small status code (0..4, handled by
           the switch below) or a code point to be written by the caller. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        switch (ch) {
        case 0:
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
        case 3:
        case 4:
            /* the offending range covers the ch - 1 bytes starting at s */
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        /* look the handler up once, on the first error */
        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = get_error_handler(errors);

        switch (error_handler) {
        case _Py_ERROR_IGNORE:
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_REPLACE:
            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
                goto onError;
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_SURROGATEESCAPE:
        {
            Py_ssize_t i;

            /* each offending byte b is written as the code point 0xdc00 + b */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            for (i=startinpos; i<endinpos; i++) {
                ch = (Py_UCS4)(unsigned char)(starts[i]);
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
                                ch + 0xdc00);
                writer.pos++;
            }
            s += (endinpos - startinpos);
            break;
        }

        default:
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "utf-8", errmsg,
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
        }
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}


/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
   non-zero, use strict error handler otherwise.

   On success, write a pointer to a newly allocated wide character string into
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
   (in number of wchar_t units) into *wlen (if wlen is set).

   On memory allocation failure, return -1.

   On decoding error (if surrogateescape is zero), return -2. If wlen is
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
   is not NULL, write the decoding error message into *reason.
*/ 4971 int 4972 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen, 4973 const char **reason, int surrogateescape) 4974 { 4975 const char *orig_s = s; 4976 const char *e; 4977 wchar_t *unicode; 4978 Py_ssize_t outpos; 4979 4980 /* Note: size will always be longer than the resulting Unicode 4981 character count */ 4982 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) { 4983 return -1; 4984 } 4985 4986 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t)); 4987 if (!unicode) { 4988 return -1; 4989 } 4990 4991 /* Unpack UTF-8 encoded data */ 4992 e = s + size; 4993 outpos = 0; 4994 while (s < e) { 4995 Py_UCS4 ch; 4996 #if SIZEOF_WCHAR_T == 4 4997 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos); 4998 #else 4999 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos); 5000 #endif 5001 if (ch > 0xFF) { 5002 #if SIZEOF_WCHAR_T == 4 5003 Py_UNREACHABLE(); 5004 #else 5005 assert(ch > 0xFFFF && ch <= MAX_UNICODE); 5006 /* write a surrogate pair */ 5007 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch); 5008 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch); 5009 #endif 5010 } 5011 else { 5012 if (!ch && s == e) 5013 break; 5014 if (!surrogateescape) { 5015 PyMem_RawFree(unicode ); 5016 if (reason != NULL) { 5017 switch (ch) { 5018 case 0: 5019 *reason = "unexpected end of data"; 5020 break; 5021 case 1: 5022 *reason = "invalid start byte"; 5023 break; 5024 /* 2, 3, 4 */ 5025 default: 5026 *reason = "invalid continuation byte"; 5027 break; 5028 } 5029 } 5030 if (wlen != NULL) { 5031 *wlen = s - orig_s; 5032 } 5033 return -2; 5034 } 5035 /* surrogateescape */ 5036 unicode[outpos++] = 0xDC00 + (unsigned char)*s++; 5037 } 5038 } 5039 unicode[outpos] = L'\0'; 5040 if (wlen) { 5041 *wlen = outpos; 5042 } 5043 *wstr = unicode; 5044 return 0; 5045 } 5046 5047 wchar_t* 5048 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen) 5049 { 5050 wchar_t *wstr; 5051 int res = _Py_DecodeUTF8Ex(arg, 
arglen, &wstr, NULL, NULL, 1); 5052 if (res != 0) { 5053 return NULL; 5054 } 5055 return wstr; 5056 } 5057 5058 5059 /* UTF-8 encoder using the surrogateescape error handler . 5060 5061 On success, return 0 and write the newly allocated character string (use 5062 PyMem_Free() to free the memory) into *str. 5063 5064 On encoding failure, return -2 and write the position of the invalid 5065 surrogate character into *error_pos (if error_pos is set) and the decoding 5066 error message into *reason (if reason is set). 5067 5068 On memory allocation failure, return -1. */ 5069 int 5070 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos, 5071 const char **reason, int raw_malloc, int surrogateescape) 5072 { 5073 const Py_ssize_t max_char_size = 4; 5074 Py_ssize_t len = wcslen(text); 5075 5076 assert(len >= 0); 5077 5078 if (len > PY_SSIZE_T_MAX / max_char_size - 1) { 5079 return -1; 5080 } 5081 char *bytes; 5082 if (raw_malloc) { 5083 bytes = PyMem_RawMalloc((len + 1) * max_char_size); 5084 } 5085 else { 5086 bytes = PyMem_Malloc((len + 1) * max_char_size); 5087 } 5088 if (bytes == NULL) { 5089 return -1; 5090 } 5091 5092 char *p = bytes; 5093 Py_ssize_t i; 5094 for (i = 0; i < len; i++) { 5095 Py_UCS4 ch = text[i]; 5096 5097 if (ch < 0x80) { 5098 /* Encode ASCII */ 5099 *p++ = (char) ch; 5100 5101 } 5102 else if (ch < 0x0800) { 5103 /* Encode Latin-1 */ 5104 *p++ = (char)(0xc0 | (ch >> 6)); 5105 *p++ = (char)(0x80 | (ch & 0x3f)); 5106 } 5107 else if (Py_UNICODE_IS_SURROGATE(ch)) { 5108 /* surrogateescape error handler */ 5109 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) { 5110 if (error_pos != NULL) { 5111 *error_pos = (size_t)i; 5112 } 5113 if (reason != NULL) { 5114 *reason = "encoding error"; 5115 } 5116 if (raw_malloc) { 5117 PyMem_RawFree(bytes); 5118 } 5119 else { 5120 PyMem_Free(bytes); 5121 } 5122 return -2; 5123 } 5124 *p++ = (char)(ch & 0xff); 5125 } 5126 else if (ch < 0x10000) { 5127 *p++ = (char)(0xe0 | (ch >> 12)); 5128 *p++ = 
(char)(0x80 | ((ch >> 6) & 0x3f)); 5129 *p++ = (char)(0x80 | (ch & 0x3f)); 5130 } 5131 else { /* ch >= 0x10000 */ 5132 assert(ch <= MAX_UNICODE); 5133 /* Encode UCS4 Unicode ordinals */ 5134 *p++ = (char)(0xf0 | (ch >> 18)); 5135 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 5136 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 5137 *p++ = (char)(0x80 | (ch & 0x3f)); 5138 } 5139 } 5140 *p++ = '\0'; 5141 5142 size_t final_size = (p - bytes); 5143 char *bytes2; 5144 if (raw_malloc) { 5145 bytes2 = PyMem_RawRealloc(bytes, final_size); 5146 } 5147 else { 5148 bytes2 = PyMem_Realloc(bytes, final_size); 5149 } 5150 if (bytes2 == NULL) { 5151 if (error_pos != NULL) { 5152 *error_pos = (size_t)-1; 5153 } 5154 if (raw_malloc) { 5155 PyMem_RawFree(bytes); 5156 } 5157 else { 5158 PyMem_Free(bytes); 5159 } 5160 return -1; 5161 } 5162 *str = bytes2; 5163 return 0; 5164 } 5165 5166 5167 /* Primary internal function which creates utf8 encoded bytes objects. 5168 5169 Allocation strategy: if the string is short, convert into a stack buffer 5170 and allocate exactly as much space needed at the end. Else allocate the 5171 maximum possible needed (4 result bytes per Unicode character), and return 5172 the excess memory at the end. 
5173 */ 5174 PyObject * 5175 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 5176 { 5177 enum PyUnicode_Kind kind; 5178 void *data; 5179 Py_ssize_t size; 5180 5181 if (!PyUnicode_Check(unicode)) { 5182 PyErr_BadArgument(); 5183 return NULL; 5184 } 5185 5186 if (PyUnicode_READY(unicode) == -1) 5187 return NULL; 5188 5189 if (PyUnicode_UTF8(unicode)) 5190 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 5191 PyUnicode_UTF8_LENGTH(unicode)); 5192 5193 kind = PyUnicode_KIND(unicode); 5194 data = PyUnicode_DATA(unicode); 5195 size = PyUnicode_GET_LENGTH(unicode); 5196 5197 switch (kind) { 5198 default: 5199 Py_UNREACHABLE(); 5200 case PyUnicode_1BYTE_KIND: 5201 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ 5202 assert(!PyUnicode_IS_ASCII(unicode)); 5203 return ucs1lib_utf8_encoder(unicode, data, size, errors); 5204 case PyUnicode_2BYTE_KIND: 5205 return ucs2lib_utf8_encoder(unicode, data, size, errors); 5206 case PyUnicode_4BYTE_KIND: 5207 return ucs4lib_utf8_encoder(unicode, data, size, errors); 5208 } 5209 } 5210 5211 PyObject * 5212 PyUnicode_EncodeUTF8(const Py_UNICODE *s, 5213 Py_ssize_t size, 5214 const char *errors) 5215 { 5216 PyObject *v, *unicode; 5217 5218 unicode = PyUnicode_FromWideChar(s, size); 5219 if (unicode == NULL) 5220 return NULL; 5221 v = _PyUnicode_AsUTF8String(unicode, errors); 5222 Py_DECREF(unicode); 5223 return v; 5224 } 5225 5226 PyObject * 5227 PyUnicode_AsUTF8String(PyObject *unicode) 5228 { 5229 return _PyUnicode_AsUTF8String(unicode, NULL); 5230 } 5231 5232 /* --- UTF-32 Codec ------------------------------------------------------- */ 5233 5234 PyObject * 5235 PyUnicode_DecodeUTF32(const char *s, 5236 Py_ssize_t size, 5237 const char *errors, 5238 int *byteorder) 5239 { 5240 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 5241 } 5242 5243 PyObject * 5244 PyUnicode_DecodeUTF32Stateful(const char *s, 5245 Py_ssize_t size, 5246 const char *errors, 5247 int *byteorder, 5248 
Py_ssize_t *consumed) 5249 { 5250 const char *starts = s; 5251 Py_ssize_t startinpos; 5252 Py_ssize_t endinpos; 5253 _PyUnicodeWriter writer; 5254 const unsigned char *q, *e; 5255 int le, bo = 0; /* assume native ordering by default */ 5256 const char *encoding; 5257 const char *errmsg = ""; 5258 PyObject *errorHandler = NULL; 5259 PyObject *exc = NULL; 5260 5261 q = (unsigned char *)s; 5262 e = q + size; 5263 5264 if (byteorder) 5265 bo = *byteorder; 5266 5267 /* Check for BOM marks (U+FEFF) in the input and adjust current 5268 byte order setting accordingly. In native mode, the leading BOM 5269 mark is skipped, in all other modes, it is copied to the output 5270 stream as-is (giving a ZWNBSP character). */ 5271 if (bo == 0 && size >= 4) { 5272 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 5273 if (bom == 0x0000FEFF) { 5274 bo = -1; 5275 q += 4; 5276 } 5277 else if (bom == 0xFFFE0000) { 5278 bo = 1; 5279 q += 4; 5280 } 5281 if (byteorder) 5282 *byteorder = bo; 5283 } 5284 5285 if (q == e) { 5286 if (consumed) 5287 *consumed = size; 5288 _Py_RETURN_UNICODE_EMPTY(); 5289 } 5290 5291 #ifdef WORDS_BIGENDIAN 5292 le = bo < 0; 5293 #else 5294 le = bo <= 0; 5295 #endif 5296 encoding = le ? 
"utf-32-le" : "utf-32-be"; 5297 5298 _PyUnicodeWriter_Init(&writer); 5299 writer.min_length = (e - q + 3) / 4; 5300 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 5301 goto onError; 5302 5303 while (1) { 5304 Py_UCS4 ch = 0; 5305 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer); 5306 5307 if (e - q >= 4) { 5308 enum PyUnicode_Kind kind = writer.kind; 5309 void *data = writer.data; 5310 const unsigned char *last = e - 4; 5311 Py_ssize_t pos = writer.pos; 5312 if (le) { 5313 do { 5314 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 5315 if (ch > maxch) 5316 break; 5317 if (kind != PyUnicode_1BYTE_KIND && 5318 Py_UNICODE_IS_SURROGATE(ch)) 5319 break; 5320 PyUnicode_WRITE(kind, data, pos++, ch); 5321 q += 4; 5322 } while (q <= last); 5323 } 5324 else { 5325 do { 5326 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3]; 5327 if (ch > maxch) 5328 break; 5329 if (kind != PyUnicode_1BYTE_KIND && 5330 Py_UNICODE_IS_SURROGATE(ch)) 5331 break; 5332 PyUnicode_WRITE(kind, data, pos++, ch); 5333 q += 4; 5334 } while (q <= last); 5335 } 5336 writer.pos = pos; 5337 } 5338 5339 if (Py_UNICODE_IS_SURROGATE(ch)) { 5340 errmsg = "code point in surrogate code point range(0xd800, 0xe000)"; 5341 startinpos = ((const char *)q) - starts; 5342 endinpos = startinpos + 4; 5343 } 5344 else if (ch <= maxch) { 5345 if (q == e || consumed) 5346 break; 5347 /* remaining bytes at the end? 
(size should be divisible by 4) */ 5348 errmsg = "truncated data"; 5349 startinpos = ((const char *)q) - starts; 5350 endinpos = ((const char *)e) - starts; 5351 } 5352 else { 5353 if (ch < 0x110000) { 5354 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5355 goto onError; 5356 q += 4; 5357 continue; 5358 } 5359 errmsg = "code point not in range(0x110000)"; 5360 startinpos = ((const char *)q) - starts; 5361 endinpos = startinpos + 4; 5362 } 5363 5364 /* The remaining input chars are ignored if the callback 5365 chooses to skip the input */ 5366 if (unicode_decode_call_errorhandler_writer( 5367 errors, &errorHandler, 5368 encoding, errmsg, 5369 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 5370 &writer)) 5371 goto onError; 5372 } 5373 5374 if (consumed) 5375 *consumed = (const char *)q-starts; 5376 5377 Py_XDECREF(errorHandler); 5378 Py_XDECREF(exc); 5379 return _PyUnicodeWriter_Finish(&writer); 5380 5381 onError: 5382 _PyUnicodeWriter_Dealloc(&writer); 5383 Py_XDECREF(errorHandler); 5384 Py_XDECREF(exc); 5385 return NULL; 5386 } 5387 5388 PyObject * 5389 _PyUnicode_EncodeUTF32(PyObject *str, 5390 const char *errors, 5391 int byteorder) 5392 { 5393 enum PyUnicode_Kind kind; 5394 const void *data; 5395 Py_ssize_t len; 5396 PyObject *v; 5397 uint32_t *out; 5398 #if PY_LITTLE_ENDIAN 5399 int native_ordering = byteorder <= 0; 5400 #else 5401 int native_ordering = byteorder >= 0; 5402 #endif 5403 const char *encoding; 5404 Py_ssize_t nsize, pos; 5405 PyObject *errorHandler = NULL; 5406 PyObject *exc = NULL; 5407 PyObject *rep = NULL; 5408 5409 if (!PyUnicode_Check(str)) { 5410 PyErr_BadArgument(); 5411 return NULL; 5412 } 5413 if (PyUnicode_READY(str) == -1) 5414 return NULL; 5415 kind = PyUnicode_KIND(str); 5416 data = PyUnicode_DATA(str); 5417 len = PyUnicode_GET_LENGTH(str); 5418 5419 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0)) 5420 return PyErr_NoMemory(); 5421 nsize = len + (byteorder == 0); 5422 v = 
PyBytes_FromStringAndSize(NULL, nsize * 4); 5423 if (v == NULL) 5424 return NULL; 5425 5426 /* output buffer is 4-bytes aligned */ 5427 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4)); 5428 out = (uint32_t *)PyBytes_AS_STRING(v); 5429 if (byteorder == 0) 5430 *out++ = 0xFEFF; 5431 if (len == 0) 5432 goto done; 5433 5434 if (byteorder == -1) 5435 encoding = "utf-32-le"; 5436 else if (byteorder == 1) 5437 encoding = "utf-32-be"; 5438 else 5439 encoding = "utf-32"; 5440 5441 if (kind == PyUnicode_1BYTE_KIND) { 5442 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering); 5443 goto done; 5444 } 5445 5446 pos = 0; 5447 while (pos < len) { 5448 Py_ssize_t repsize, moreunits; 5449 5450 if (kind == PyUnicode_2BYTE_KIND) { 5451 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos, 5452 &out, native_ordering); 5453 } 5454 else { 5455 assert(kind == PyUnicode_4BYTE_KIND); 5456 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos, 5457 &out, native_ordering); 5458 } 5459 if (pos == len) 5460 break; 5461 5462 rep = unicode_encode_call_errorhandler( 5463 errors, &errorHandler, 5464 encoding, "surrogates not allowed", 5465 str, &exc, pos, pos + 1, &pos); 5466 if (!rep) 5467 goto error; 5468 5469 if (PyBytes_Check(rep)) { 5470 repsize = PyBytes_GET_SIZE(rep); 5471 if (repsize & 3) { 5472 raise_encode_exception(&exc, encoding, 5473 str, pos - 1, pos, 5474 "surrogates not allowed"); 5475 goto error; 5476 } 5477 moreunits = repsize / 4; 5478 } 5479 else { 5480 assert(PyUnicode_Check(rep)); 5481 if (PyUnicode_READY(rep) < 0) 5482 goto error; 5483 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5484 if (!PyUnicode_IS_ASCII(rep)) { 5485 raise_encode_exception(&exc, encoding, 5486 str, pos - 1, pos, 5487 "surrogates not allowed"); 5488 goto error; 5489 } 5490 } 5491 5492 /* four bytes are reserved for each surrogate */ 5493 if (moreunits > 1) { 5494 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v); 5495 if (moreunits >= 
(PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) { 5496 /* integer overflow */ 5497 PyErr_NoMemory(); 5498 goto error; 5499 } 5500 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0) 5501 goto error; 5502 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos; 5503 } 5504 5505 if (PyBytes_Check(rep)) { 5506 memcpy(out, PyBytes_AS_STRING(rep), repsize); 5507 out += moreunits; 5508 } else /* rep is unicode */ { 5509 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5510 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize, 5511 &out, native_ordering); 5512 } 5513 5514 Py_CLEAR(rep); 5515 } 5516 5517 /* Cut back to size actually needed. This is necessary for, for example, 5518 encoding of a string containing isolated surrogates and the 'ignore' 5519 handler is used. */ 5520 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); 5521 if (nsize != PyBytes_GET_SIZE(v)) 5522 _PyBytes_Resize(&v, nsize); 5523 Py_XDECREF(errorHandler); 5524 Py_XDECREF(exc); 5525 done: 5526 return v; 5527 error: 5528 Py_XDECREF(rep); 5529 Py_XDECREF(errorHandler); 5530 Py_XDECREF(exc); 5531 Py_XDECREF(v); 5532 return NULL; 5533 } 5534 5535 PyObject * 5536 PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5537 Py_ssize_t size, 5538 const char *errors, 5539 int byteorder) 5540 { 5541 PyObject *result; 5542 PyObject *tmp = PyUnicode_FromWideChar(s, size); 5543 if (tmp == NULL) 5544 return NULL; 5545 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); 5546 Py_DECREF(tmp); 5547 return result; 5548 } 5549 5550 PyObject * 5551 PyUnicode_AsUTF32String(PyObject *unicode) 5552 { 5553 return _PyUnicode_EncodeUTF32(unicode, NULL, 0); 5554 } 5555 5556 /* --- UTF-16 Codec ------------------------------------------------------- */ 5557 5558 PyObject * 5559 PyUnicode_DecodeUTF16(const char *s, 5560 Py_ssize_t size, 5561 const char *errors, 5562 int *byteorder) 5563 { 5564 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5565 } 5566 5567 PyObject * 5568 
PyUnicode_DecodeUTF16Stateful(const char *s, 5569 Py_ssize_t size, 5570 const char *errors, 5571 int *byteorder, 5572 Py_ssize_t *consumed) 5573 { 5574 const char *starts = s; 5575 Py_ssize_t startinpos; 5576 Py_ssize_t endinpos; 5577 _PyUnicodeWriter writer; 5578 const unsigned char *q, *e; 5579 int bo = 0; /* assume native ordering by default */ 5580 int native_ordering; 5581 const char *errmsg = ""; 5582 PyObject *errorHandler = NULL; 5583 PyObject *exc = NULL; 5584 const char *encoding; 5585 5586 q = (unsigned char *)s; 5587 e = q + size; 5588 5589 if (byteorder) 5590 bo = *byteorder; 5591 5592 /* Check for BOM marks (U+FEFF) in the input and adjust current 5593 byte order setting accordingly. In native mode, the leading BOM 5594 mark is skipped, in all other modes, it is copied to the output 5595 stream as-is (giving a ZWNBSP character). */ 5596 if (bo == 0 && size >= 2) { 5597 const Py_UCS4 bom = (q[1] << 8) | q[0]; 5598 if (bom == 0xFEFF) { 5599 q += 2; 5600 bo = -1; 5601 } 5602 else if (bom == 0xFFFE) { 5603 q += 2; 5604 bo = 1; 5605 } 5606 if (byteorder) 5607 *byteorder = bo; 5608 } 5609 5610 if (q == e) { 5611 if (consumed) 5612 *consumed = size; 5613 _Py_RETURN_UNICODE_EMPTY(); 5614 } 5615 5616 #if PY_LITTLE_ENDIAN 5617 native_ordering = bo <= 0; 5618 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be"; 5619 #else 5620 native_ordering = bo >= 0; 5621 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le"; 5622 #endif 5623 5624 /* Note: size will always be longer than the resulting Unicode 5625 character count normally. Error handler will take care of 5626 resizing when needed. 
*/ 5627 _PyUnicodeWriter_Init(&writer); 5628 writer.min_length = (e - q + 1) / 2; 5629 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 5630 goto onError; 5631 5632 while (1) { 5633 Py_UCS4 ch = 0; 5634 if (e - q >= 2) { 5635 int kind = writer.kind; 5636 if (kind == PyUnicode_1BYTE_KIND) { 5637 if (PyUnicode_IS_ASCII(writer.buffer)) 5638 ch = asciilib_utf16_decode(&q, e, 5639 (Py_UCS1*)writer.data, &writer.pos, 5640 native_ordering); 5641 else 5642 ch = ucs1lib_utf16_decode(&q, e, 5643 (Py_UCS1*)writer.data, &writer.pos, 5644 native_ordering); 5645 } else if (kind == PyUnicode_2BYTE_KIND) { 5646 ch = ucs2lib_utf16_decode(&q, e, 5647 (Py_UCS2*)writer.data, &writer.pos, 5648 native_ordering); 5649 } else { 5650 assert(kind == PyUnicode_4BYTE_KIND); 5651 ch = ucs4lib_utf16_decode(&q, e, 5652 (Py_UCS4*)writer.data, &writer.pos, 5653 native_ordering); 5654 } 5655 } 5656 5657 switch (ch) 5658 { 5659 case 0: 5660 /* remaining byte at the end? (size should be even) */ 5661 if (q == e || consumed) 5662 goto End; 5663 errmsg = "truncated data"; 5664 startinpos = ((const char *)q) - starts; 5665 endinpos = ((const char *)e) - starts; 5666 break; 5667 /* The remaining input chars are ignored if the callback 5668 chooses to skip the input */ 5669 case 1: 5670 q -= 2; 5671 if (consumed) 5672 goto End; 5673 errmsg = "unexpected end of data"; 5674 startinpos = ((const char *)q) - starts; 5675 endinpos = ((const char *)e) - starts; 5676 break; 5677 case 2: 5678 errmsg = "illegal encoding"; 5679 startinpos = ((const char *)q) - 2 - starts; 5680 endinpos = startinpos + 2; 5681 break; 5682 case 3: 5683 errmsg = "illegal UTF-16 surrogate"; 5684 startinpos = ((const char *)q) - 4 - starts; 5685 endinpos = startinpos + 2; 5686 break; 5687 default: 5688 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5689 goto onError; 5690 continue; 5691 } 5692 5693 if (unicode_decode_call_errorhandler_writer( 5694 errors, 5695 &errorHandler, 5696 encoding, errmsg, 5697 
&starts, 5698 (const char **)&e, 5699 &startinpos, 5700 &endinpos, 5701 &exc, 5702 (const char **)&q, 5703 &writer)) 5704 goto onError; 5705 } 5706 5707 End: 5708 if (consumed) 5709 *consumed = (const char *)q-starts; 5710 5711 Py_XDECREF(errorHandler); 5712 Py_XDECREF(exc); 5713 return _PyUnicodeWriter_Finish(&writer); 5714 5715 onError: 5716 _PyUnicodeWriter_Dealloc(&writer); 5717 Py_XDECREF(errorHandler); 5718 Py_XDECREF(exc); 5719 return NULL; 5720 } 5721 5722 PyObject * 5723 _PyUnicode_EncodeUTF16(PyObject *str, 5724 const char *errors, 5725 int byteorder) 5726 { 5727 enum PyUnicode_Kind kind; 5728 const void *data; 5729 Py_ssize_t len; 5730 PyObject *v; 5731 unsigned short *out; 5732 Py_ssize_t pairs; 5733 #if PY_BIG_ENDIAN 5734 int native_ordering = byteorder >= 0; 5735 #else 5736 int native_ordering = byteorder <= 0; 5737 #endif 5738 const char *encoding; 5739 Py_ssize_t nsize, pos; 5740 PyObject *errorHandler = NULL; 5741 PyObject *exc = NULL; 5742 PyObject *rep = NULL; 5743 5744 if (!PyUnicode_Check(str)) { 5745 PyErr_BadArgument(); 5746 return NULL; 5747 } 5748 if (PyUnicode_READY(str) == -1) 5749 return NULL; 5750 kind = PyUnicode_KIND(str); 5751 data = PyUnicode_DATA(str); 5752 len = PyUnicode_GET_LENGTH(str); 5753 5754 pairs = 0; 5755 if (kind == PyUnicode_4BYTE_KIND) { 5756 const Py_UCS4 *in = (const Py_UCS4 *)data; 5757 const Py_UCS4 *end = in + len; 5758 while (in < end) { 5759 if (*in++ >= 0x10000) { 5760 pairs++; 5761 } 5762 } 5763 } 5764 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) { 5765 return PyErr_NoMemory(); 5766 } 5767 nsize = len + pairs + (byteorder == 0); 5768 v = PyBytes_FromStringAndSize(NULL, nsize * 2); 5769 if (v == NULL) { 5770 return NULL; 5771 } 5772 5773 /* output buffer is 2-bytes aligned */ 5774 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); 5775 out = (unsigned short *)PyBytes_AS_STRING(v); 5776 if (byteorder == 0) { 5777 *out++ = 0xFEFF; 5778 } 5779 if (len == 0) { 5780 goto done; 5781 } 5782 5783 if (kind == 
PyUnicode_1BYTE_KIND) { 5784 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering); 5785 goto done; 5786 } 5787 5788 if (byteorder < 0) { 5789 encoding = "utf-16-le"; 5790 } 5791 else if (byteorder > 0) { 5792 encoding = "utf-16-be"; 5793 } 5794 else { 5795 encoding = "utf-16"; 5796 } 5797 5798 pos = 0; 5799 while (pos < len) { 5800 Py_ssize_t repsize, moreunits; 5801 5802 if (kind == PyUnicode_2BYTE_KIND) { 5803 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos, 5804 &out, native_ordering); 5805 } 5806 else { 5807 assert(kind == PyUnicode_4BYTE_KIND); 5808 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos, 5809 &out, native_ordering); 5810 } 5811 if (pos == len) 5812 break; 5813 5814 rep = unicode_encode_call_errorhandler( 5815 errors, &errorHandler, 5816 encoding, "surrogates not allowed", 5817 str, &exc, pos, pos + 1, &pos); 5818 if (!rep) 5819 goto error; 5820 5821 if (PyBytes_Check(rep)) { 5822 repsize = PyBytes_GET_SIZE(rep); 5823 if (repsize & 1) { 5824 raise_encode_exception(&exc, encoding, 5825 str, pos - 1, pos, 5826 "surrogates not allowed"); 5827 goto error; 5828 } 5829 moreunits = repsize / 2; 5830 } 5831 else { 5832 assert(PyUnicode_Check(rep)); 5833 if (PyUnicode_READY(rep) < 0) 5834 goto error; 5835 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5836 if (!PyUnicode_IS_ASCII(rep)) { 5837 raise_encode_exception(&exc, encoding, 5838 str, pos - 1, pos, 5839 "surrogates not allowed"); 5840 goto error; 5841 } 5842 } 5843 5844 /* two bytes are reserved for each surrogate */ 5845 if (moreunits > 1) { 5846 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v); 5847 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) { 5848 /* integer overflow */ 5849 PyErr_NoMemory(); 5850 goto error; 5851 } 5852 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0) 5853 goto error; 5854 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos; 5855 } 5856 5857 if (PyBytes_Check(rep)) { 
5858 memcpy(out, PyBytes_AS_STRING(rep), repsize); 5859 out += moreunits; 5860 } else /* rep is unicode */ { 5861 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5862 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize, 5863 &out, native_ordering); 5864 } 5865 5866 Py_CLEAR(rep); 5867 } 5868 5869 /* Cut back to size actually needed. This is necessary for, for example, 5870 encoding of a string containing isolated surrogates and the 'ignore' handler 5871 is used. */ 5872 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); 5873 if (nsize != PyBytes_GET_SIZE(v)) 5874 _PyBytes_Resize(&v, nsize); 5875 Py_XDECREF(errorHandler); 5876 Py_XDECREF(exc); 5877 done: 5878 return v; 5879 error: 5880 Py_XDECREF(rep); 5881 Py_XDECREF(errorHandler); 5882 Py_XDECREF(exc); 5883 Py_XDECREF(v); 5884 return NULL; 5885 #undef STORECHAR 5886 } 5887 5888 PyObject * 5889 PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5890 Py_ssize_t size, 5891 const char *errors, 5892 int byteorder) 5893 { 5894 PyObject *result; 5895 PyObject *tmp = PyUnicode_FromWideChar(s, size); 5896 if (tmp == NULL) 5897 return NULL; 5898 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); 5899 Py_DECREF(tmp); 5900 return result; 5901 } 5902 5903 PyObject * 5904 PyUnicode_AsUTF16String(PyObject *unicode) 5905 { 5906 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 5907 } 5908 5909 /* --- Unicode Escape Codec ----------------------------------------------- */ 5910 5911 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5912 5913 PyObject * 5914 _PyUnicode_DecodeUnicodeEscape(const char *s, 5915 Py_ssize_t size, 5916 const char *errors, 5917 const char **first_invalid_escape) 5918 { 5919 const char *starts = s; 5920 _PyUnicodeWriter writer; 5921 const char *end; 5922 PyObject *errorHandler = NULL; 5923 PyObject *exc = NULL; 5924 5925 // so we can remember if we've seen an invalid escape char or not 5926 *first_invalid_escape = NULL; 5927 5928 if (size == 0) { 5929 _Py_RETURN_UNICODE_EMPTY(); 
5930 } 5931 /* Escaped strings will always be longer than the resulting 5932 Unicode string, so we start with size here and then reduce the 5933 length after conversion to the true value. 5934 (but if the error callback returns a long replacement string 5935 we'll have to allocate more space) */ 5936 _PyUnicodeWriter_Init(&writer); 5937 writer.min_length = size; 5938 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) { 5939 goto onError; 5940 } 5941 5942 end = s + size; 5943 while (s < end) { 5944 unsigned char c = (unsigned char) *s++; 5945 Py_UCS4 ch; 5946 int count; 5947 Py_ssize_t startinpos; 5948 Py_ssize_t endinpos; 5949 const char *message; 5950 5951 #define WRITE_ASCII_CHAR(ch) \ 5952 do { \ 5953 assert(ch <= 127); \ 5954 assert(writer.pos < writer.size); \ 5955 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \ 5956 } while(0) 5957 5958 #define WRITE_CHAR(ch) \ 5959 do { \ 5960 if (ch <= writer.maxchar) { \ 5961 assert(writer.pos < writer.size); \ 5962 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \ 5963 } \ 5964 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \ 5965 goto onError; \ 5966 } \ 5967 } while(0) 5968 5969 /* Non-escape characters are interpreted as Unicode ordinals */ 5970 if (c != '\\') { 5971 WRITE_CHAR(c); 5972 continue; 5973 } 5974 5975 startinpos = s - starts - 1; 5976 /* \ - Escapes */ 5977 if (s >= end) { 5978 message = "\\ at end of string"; 5979 goto error; 5980 } 5981 c = (unsigned char) *s++; 5982 5983 assert(writer.pos < writer.size); 5984 switch (c) { 5985 5986 /* \x escapes */ 5987 case '\n': continue; 5988 case '\\': WRITE_ASCII_CHAR('\\'); continue; 5989 case '\'': WRITE_ASCII_CHAR('\''); continue; 5990 case '\"': WRITE_ASCII_CHAR('\"'); continue; 5991 case 'b': WRITE_ASCII_CHAR('\b'); continue; 5992 /* FF */ 5993 case 'f': WRITE_ASCII_CHAR('\014'); continue; 5994 case 't': WRITE_ASCII_CHAR('\t'); continue; 5995 case 'n': WRITE_ASCII_CHAR('\n'); continue; 5996 case 'r': 
WRITE_ASCII_CHAR('\r'); continue; 5997 /* VT */ 5998 case 'v': WRITE_ASCII_CHAR('\013'); continue; 5999 /* BEL, not classic C */ 6000 case 'a': WRITE_ASCII_CHAR('\007'); continue; 6001 6002 /* \OOO (octal) escapes */ 6003 case '0': case '1': case '2': case '3': 6004 case '4': case '5': case '6': case '7': 6005 ch = c - '0'; 6006 if (s < end && '0' <= *s && *s <= '7') { 6007 ch = (ch<<3) + *s++ - '0'; 6008 if (s < end && '0' <= *s && *s <= '7') { 6009 ch = (ch<<3) + *s++ - '0'; 6010 } 6011 } 6012 WRITE_CHAR(ch); 6013 continue; 6014 6015 /* hex escapes */ 6016 /* \xXX */ 6017 case 'x': 6018 count = 2; 6019 message = "truncated \\xXX escape"; 6020 goto hexescape; 6021 6022 /* \uXXXX */ 6023 case 'u': 6024 count = 4; 6025 message = "truncated \\uXXXX escape"; 6026 goto hexescape; 6027 6028 /* \UXXXXXXXX */ 6029 case 'U': 6030 count = 8; 6031 message = "truncated \\UXXXXXXXX escape"; 6032 hexescape: 6033 for (ch = 0; count && s < end; ++s, --count) { 6034 c = (unsigned char)*s; 6035 ch <<= 4; 6036 if (c >= '0' && c <= '9') { 6037 ch += c - '0'; 6038 } 6039 else if (c >= 'a' && c <= 'f') { 6040 ch += c - ('a' - 10); 6041 } 6042 else if (c >= 'A' && c <= 'F') { 6043 ch += c - ('A' - 10); 6044 } 6045 else { 6046 break; 6047 } 6048 } 6049 if (count) { 6050 goto error; 6051 } 6052 6053 /* when we get here, ch is a 32-bit unicode character */ 6054 if (ch > MAX_UNICODE) { 6055 message = "illegal Unicode character"; 6056 goto error; 6057 } 6058 6059 WRITE_CHAR(ch); 6060 continue; 6061 6062 /* \N{name} */ 6063 case 'N': 6064 if (ucnhash_CAPI == NULL) { 6065 /* load the unicode data module */ 6066 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 6067 PyUnicodeData_CAPSULE_NAME, 1); 6068 if (ucnhash_CAPI == NULL) { 6069 PyErr_SetString( 6070 PyExc_UnicodeError, 6071 "\\N escapes not supported (can't load unicodedata module)" 6072 ); 6073 goto onError; 6074 } 6075 } 6076 6077 message = "malformed \\N character escape"; 6078 if (s < end && *s == '{') { 6079 const char *start 
= ++s; 6080 size_t namelen; 6081 /* look for the closing brace */ 6082 while (s < end && *s != '}') 6083 s++; 6084 namelen = s - start; 6085 if (namelen && s < end) { 6086 /* found a name. look it up in the unicode database */ 6087 s++; 6088 ch = 0xffffffff; /* in case 'getcode' messes up */ 6089 if (namelen <= INT_MAX && 6090 ucnhash_CAPI->getcode(NULL, start, (int)namelen, 6091 &ch, 0)) { 6092 assert(ch <= MAX_UNICODE); 6093 WRITE_CHAR(ch); 6094 continue; 6095 } 6096 message = "unknown Unicode character name"; 6097 } 6098 } 6099 goto error; 6100 6101 default: 6102 if (*first_invalid_escape == NULL) { 6103 *first_invalid_escape = s-1; /* Back up one char, since we've 6104 already incremented s. */ 6105 } 6106 WRITE_ASCII_CHAR('\\'); 6107 WRITE_CHAR(c); 6108 continue; 6109 } 6110 6111 error: 6112 endinpos = s-starts; 6113 writer.min_length = end - s + writer.pos; 6114 if (unicode_decode_call_errorhandler_writer( 6115 errors, &errorHandler, 6116 "unicodeescape", message, 6117 &starts, &end, &startinpos, &endinpos, &exc, &s, 6118 &writer)) { 6119 goto onError; 6120 } 6121 assert(end - s <= writer.size - writer.pos); 6122 6123 #undef WRITE_ASCII_CHAR 6124 #undef WRITE_CHAR 6125 } 6126 6127 Py_XDECREF(errorHandler); 6128 Py_XDECREF(exc); 6129 return _PyUnicodeWriter_Finish(&writer); 6130 6131 onError: 6132 _PyUnicodeWriter_Dealloc(&writer); 6133 Py_XDECREF(errorHandler); 6134 Py_XDECREF(exc); 6135 return NULL; 6136 } 6137 6138 PyObject * 6139 PyUnicode_DecodeUnicodeEscape(const char *s, 6140 Py_ssize_t size, 6141 const char *errors) 6142 { 6143 const char *first_invalid_escape; 6144 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors, 6145 &first_invalid_escape); 6146 if (result == NULL) 6147 return NULL; 6148 if (first_invalid_escape != NULL) { 6149 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, 6150 "invalid escape sequence '\\%c'", 6151 (unsigned char)*first_invalid_escape) < 0) { 6152 Py_DECREF(result); 6153 return NULL; 6154 } 6155 } 6156 return 
result; 6157 } 6158 6159 /* Return a Unicode-Escape string version of the Unicode object. */ 6160 6161 PyObject * 6162 PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 6163 { 6164 Py_ssize_t i, len; 6165 PyObject *repr; 6166 char *p; 6167 enum PyUnicode_Kind kind; 6168 void *data; 6169 Py_ssize_t expandsize; 6170 6171 /* Initial allocation is based on the longest-possible character 6172 escape. 6173 6174 For UCS1 strings it's '\xxx', 4 bytes per source character. 6175 For UCS2 strings it's '\uxxxx', 6 bytes per source character. 6176 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. 6177 */ 6178 6179 if (!PyUnicode_Check(unicode)) { 6180 PyErr_BadArgument(); 6181 return NULL; 6182 } 6183 if (PyUnicode_READY(unicode) == -1) { 6184 return NULL; 6185 } 6186 6187 len = PyUnicode_GET_LENGTH(unicode); 6188 if (len == 0) { 6189 return PyBytes_FromStringAndSize(NULL, 0); 6190 } 6191 6192 kind = PyUnicode_KIND(unicode); 6193 data = PyUnicode_DATA(unicode); 6194 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 6195 bytes, and 1 byte characters 4. 
*/ 6196 expandsize = kind * 2 + 2; 6197 if (len > PY_SSIZE_T_MAX / expandsize) { 6198 return PyErr_NoMemory(); 6199 } 6200 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6201 if (repr == NULL) { 6202 return NULL; 6203 } 6204 6205 p = PyBytes_AS_STRING(repr); 6206 for (i = 0; i < len; i++) { 6207 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 6208 6209 /* U+0000-U+00ff range */ 6210 if (ch < 0x100) { 6211 if (ch >= ' ' && ch < 127) { 6212 if (ch != '\\') { 6213 /* Copy printable US ASCII as-is */ 6214 *p++ = (char) ch; 6215 } 6216 /* Escape backslashes */ 6217 else { 6218 *p++ = '\\'; 6219 *p++ = '\\'; 6220 } 6221 } 6222 6223 /* Map special whitespace to '\t', \n', '\r' */ 6224 else if (ch == '\t') { 6225 *p++ = '\\'; 6226 *p++ = 't'; 6227 } 6228 else if (ch == '\n') { 6229 *p++ = '\\'; 6230 *p++ = 'n'; 6231 } 6232 else if (ch == '\r') { 6233 *p++ = '\\'; 6234 *p++ = 'r'; 6235 } 6236 6237 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */ 6238 else { 6239 *p++ = '\\'; 6240 *p++ = 'x'; 6241 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6242 *p++ = Py_hexdigits[ch & 0x000F]; 6243 } 6244 } 6245 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */ 6246 else if (ch < 0x10000) { 6247 *p++ = '\\'; 6248 *p++ = 'u'; 6249 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 6250 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 6251 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6252 *p++ = Py_hexdigits[ch & 0x000F]; 6253 } 6254 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */ 6255 else { 6256 6257 /* Make sure that the first two digits are zero */ 6258 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff); 6259 *p++ = '\\'; 6260 *p++ = 'U'; 6261 *p++ = '0'; 6262 *p++ = '0'; 6263 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; 6264 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; 6265 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 6266 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 6267 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 6268 *p++ = Py_hexdigits[ch & 
0x0000000F]; 6269 } 6270 } 6271 6272 assert(p - PyBytes_AS_STRING(repr) > 0); 6273 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) { 6274 return NULL; 6275 } 6276 return repr; 6277 } 6278 6279 PyObject * 6280 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 6281 Py_ssize_t size) 6282 { 6283 PyObject *result; 6284 PyObject *tmp = PyUnicode_FromWideChar(s, size); 6285 if (tmp == NULL) { 6286 return NULL; 6287 } 6288 6289 result = PyUnicode_AsUnicodeEscapeString(tmp); 6290 Py_DECREF(tmp); 6291 return result; 6292 } 6293 6294 /* --- Raw Unicode Escape Codec ------------------------------------------- */ 6295 6296 PyObject * 6297 PyUnicode_DecodeRawUnicodeEscape(const char *s, 6298 Py_ssize_t size, 6299 const char *errors) 6300 { 6301 const char *starts = s; 6302 _PyUnicodeWriter writer; 6303 const char *end; 6304 PyObject *errorHandler = NULL; 6305 PyObject *exc = NULL; 6306 6307 if (size == 0) { 6308 _Py_RETURN_UNICODE_EMPTY(); 6309 } 6310 6311 /* Escaped strings will always be longer than the resulting 6312 Unicode string, so we start with size here and then reduce the 6313 length after conversion to the true value. 
(But decoding error 6314 handler might have to resize the string) */ 6315 _PyUnicodeWriter_Init(&writer); 6316 writer.min_length = size; 6317 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) { 6318 goto onError; 6319 } 6320 6321 end = s + size; 6322 while (s < end) { 6323 unsigned char c = (unsigned char) *s++; 6324 Py_UCS4 ch; 6325 int count; 6326 Py_ssize_t startinpos; 6327 Py_ssize_t endinpos; 6328 const char *message; 6329 6330 #define WRITE_CHAR(ch) \ 6331 do { \ 6332 if (ch <= writer.maxchar) { \ 6333 assert(writer.pos < writer.size); \ 6334 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \ 6335 } \ 6336 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \ 6337 goto onError; \ 6338 } \ 6339 } while(0) 6340 6341 /* Non-escape characters are interpreted as Unicode ordinals */ 6342 if (c != '\\' || s >= end) { 6343 WRITE_CHAR(c); 6344 continue; 6345 } 6346 6347 c = (unsigned char) *s++; 6348 if (c == 'u') { 6349 count = 4; 6350 message = "truncated \\uXXXX escape"; 6351 } 6352 else if (c == 'U') { 6353 count = 8; 6354 message = "truncated \\UXXXXXXXX escape"; 6355 } 6356 else { 6357 assert(writer.pos < writer.size); 6358 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\'); 6359 WRITE_CHAR(c); 6360 continue; 6361 } 6362 startinpos = s - starts - 2; 6363 6364 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */ 6365 for (ch = 0; count && s < end; ++s, --count) { 6366 c = (unsigned char)*s; 6367 ch <<= 4; 6368 if (c >= '0' && c <= '9') { 6369 ch += c - '0'; 6370 } 6371 else if (c >= 'a' && c <= 'f') { 6372 ch += c - ('a' - 10); 6373 } 6374 else if (c >= 'A' && c <= 'F') { 6375 ch += c - ('A' - 10); 6376 } 6377 else { 6378 break; 6379 } 6380 } 6381 if (!count) { 6382 if (ch <= MAX_UNICODE) { 6383 WRITE_CHAR(ch); 6384 continue; 6385 } 6386 message = "\\Uxxxxxxxx out of range"; 6387 } 6388 6389 endinpos = s-starts; 6390 writer.min_length = end - s + writer.pos; 6391 if (unicode_decode_call_errorhandler_writer( 6392 errors, 
&errorHandler, 6393 "rawunicodeescape", message, 6394 &starts, &end, &startinpos, &endinpos, &exc, &s, 6395 &writer)) { 6396 goto onError; 6397 } 6398 assert(end - s <= writer.size - writer.pos); 6399 6400 #undef WRITE_CHAR 6401 } 6402 Py_XDECREF(errorHandler); 6403 Py_XDECREF(exc); 6404 return _PyUnicodeWriter_Finish(&writer); 6405 6406 onError: 6407 _PyUnicodeWriter_Dealloc(&writer); 6408 Py_XDECREF(errorHandler); 6409 Py_XDECREF(exc); 6410 return NULL; 6411 6412 } 6413 6414 6415 PyObject * 6416 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6417 { 6418 PyObject *repr; 6419 char *p; 6420 Py_ssize_t expandsize, pos; 6421 int kind; 6422 void *data; 6423 Py_ssize_t len; 6424 6425 if (!PyUnicode_Check(unicode)) { 6426 PyErr_BadArgument(); 6427 return NULL; 6428 } 6429 if (PyUnicode_READY(unicode) == -1) { 6430 return NULL; 6431 } 6432 kind = PyUnicode_KIND(unicode); 6433 data = PyUnicode_DATA(unicode); 6434 len = PyUnicode_GET_LENGTH(unicode); 6435 if (kind == PyUnicode_1BYTE_KIND) { 6436 return PyBytes_FromStringAndSize(data, len); 6437 } 6438 6439 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 6440 bytes, and 1 byte characters 4. 
*/ 6441 expandsize = kind * 2 + 2; 6442 6443 if (len > PY_SSIZE_T_MAX / expandsize) { 6444 return PyErr_NoMemory(); 6445 } 6446 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6447 if (repr == NULL) { 6448 return NULL; 6449 } 6450 if (len == 0) { 6451 return repr; 6452 } 6453 6454 p = PyBytes_AS_STRING(repr); 6455 for (pos = 0; pos < len; pos++) { 6456 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6457 6458 /* U+0000-U+00ff range: Copy 8-bit characters as-is */ 6459 if (ch < 0x100) { 6460 *p++ = (char) ch; 6461 } 6462 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */ 6463 else if (ch < 0x10000) { 6464 *p++ = '\\'; 6465 *p++ = 'u'; 6466 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6467 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6468 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6469 *p++ = Py_hexdigits[ch & 15]; 6470 } 6471 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */ 6472 else { 6473 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff); 6474 *p++ = '\\'; 6475 *p++ = 'U'; 6476 *p++ = '0'; 6477 *p++ = '0'; 6478 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 6479 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 6480 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6481 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6482 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6483 *p++ = Py_hexdigits[ch & 15]; 6484 } 6485 } 6486 6487 assert(p > PyBytes_AS_STRING(repr)); 6488 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) { 6489 return NULL; 6490 } 6491 return repr; 6492 } 6493 6494 PyObject * 6495 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 6496 Py_ssize_t size) 6497 { 6498 PyObject *result; 6499 PyObject *tmp = PyUnicode_FromWideChar(s, size); 6500 if (tmp == NULL) 6501 return NULL; 6502 result = PyUnicode_AsRawUnicodeEscapeString(tmp); 6503 Py_DECREF(tmp); 6504 return result; 6505 } 6506 6507 /* --- Unicode Internal Codec ------------------------------------------- */ 6508 6509 PyObject * 6510 _PyUnicode_DecodeUnicodeInternal(const char *s, 6511 Py_ssize_t size, 6512 
const char *errors) 6513 { 6514 const char *starts = s; 6515 Py_ssize_t startinpos; 6516 Py_ssize_t endinpos; 6517 _PyUnicodeWriter writer; 6518 const char *end; 6519 const char *reason; 6520 PyObject *errorHandler = NULL; 6521 PyObject *exc = NULL; 6522 6523 if (PyErr_WarnEx(PyExc_DeprecationWarning, 6524 "unicode_internal codec has been deprecated", 6525 1)) 6526 return NULL; 6527 6528 if (size < 0) { 6529 PyErr_BadInternalCall(); 6530 return NULL; 6531 } 6532 if (size == 0) 6533 _Py_RETURN_UNICODE_EMPTY(); 6534 6535 _PyUnicodeWriter_Init(&writer); 6536 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) { 6537 PyErr_NoMemory(); 6538 goto onError; 6539 } 6540 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE; 6541 6542 end = s + size; 6543 while (s < end) { 6544 Py_UNICODE uch; 6545 Py_UCS4 ch; 6546 if (end - s < Py_UNICODE_SIZE) { 6547 endinpos = end-starts; 6548 reason = "truncated input"; 6549 goto error; 6550 } 6551 /* We copy the raw representation one byte at a time because the 6552 pointer may be unaligned (see test_codeccallbacks). */ 6553 ((char *) &uch)[0] = s[0]; 6554 ((char *) &uch)[1] = s[1]; 6555 #ifdef Py_UNICODE_WIDE 6556 ((char *) &uch)[2] = s[2]; 6557 ((char *) &uch)[3] = s[3]; 6558 #endif 6559 ch = uch; 6560 #ifdef Py_UNICODE_WIDE 6561 /* We have to sanity check the raw data, otherwise doom looms for 6562 some malformed UCS-4 data. 
*/ 6563 if (ch > 0x10ffff) { 6564 endinpos = s - starts + Py_UNICODE_SIZE; 6565 reason = "illegal code point (> 0x10FFFF)"; 6566 goto error; 6567 } 6568 #endif 6569 s += Py_UNICODE_SIZE; 6570 #ifndef Py_UNICODE_WIDE 6571 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE) 6572 { 6573 Py_UNICODE uch2; 6574 ((char *) &uch2)[0] = s[0]; 6575 ((char *) &uch2)[1] = s[1]; 6576 if (Py_UNICODE_IS_LOW_SURROGATE(uch2)) 6577 { 6578 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2); 6579 s += Py_UNICODE_SIZE; 6580 } 6581 } 6582 #endif 6583 6584 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 6585 goto onError; 6586 continue; 6587 6588 error: 6589 startinpos = s - starts; 6590 if (unicode_decode_call_errorhandler_writer( 6591 errors, &errorHandler, 6592 "unicode_internal", reason, 6593 &starts, &end, &startinpos, &endinpos, &exc, &s, 6594 &writer)) 6595 goto onError; 6596 } 6597 6598 Py_XDECREF(errorHandler); 6599 Py_XDECREF(exc); 6600 return _PyUnicodeWriter_Finish(&writer); 6601 6602 onError: 6603 _PyUnicodeWriter_Dealloc(&writer); 6604 Py_XDECREF(errorHandler); 6605 Py_XDECREF(exc); 6606 return NULL; 6607 } 6608 6609 /* --- Latin-1 Codec ------------------------------------------------------ */ 6610 6611 PyObject * 6612 PyUnicode_DecodeLatin1(const char *s, 6613 Py_ssize_t size, 6614 const char *errors) 6615 { 6616 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 6617 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6618 } 6619 6620 /* create or adjust a UnicodeEncodeError */ 6621 static void 6622 make_encode_exception(PyObject **exceptionObject, 6623 const char *encoding, 6624 PyObject *unicode, 6625 Py_ssize_t startpos, Py_ssize_t endpos, 6626 const char *reason) 6627 { 6628 if (*exceptionObject == NULL) { 6629 *exceptionObject = PyObject_CallFunction( 6630 PyExc_UnicodeEncodeError, "sOnns", 6631 encoding, unicode, startpos, endpos, reason); 6632 } 6633 else { 6634 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6635 goto onError; 6636 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6637 goto onError; 6638 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6639 goto onError; 6640 return; 6641 onError: 6642 Py_CLEAR(*exceptionObject); 6643 } 6644 } 6645 6646 /* raises a UnicodeEncodeError */ 6647 static void 6648 raise_encode_exception(PyObject **exceptionObject, 6649 const char *encoding, 6650 PyObject *unicode, 6651 Py_ssize_t startpos, Py_ssize_t endpos, 6652 const char *reason) 6653 { 6654 make_encode_exception(exceptionObject, 6655 encoding, unicode, startpos, endpos, reason); 6656 if (*exceptionObject != NULL) 6657 PyCodec_StrictErrors(*exceptionObject); 6658 } 6659 6660 /* error handling callback helper: 6661 build arguments, call the callback and check the arguments, 6662 put the result into newpos and return the replacement string, which 6663 has to be freed by the caller */ 6664 static PyObject * 6665 unicode_encode_call_errorhandler(const char *errors, 6666 PyObject **errorHandler, 6667 const char *encoding, const char *reason, 6668 PyObject *unicode, PyObject **exceptionObject, 6669 Py_ssize_t startpos, Py_ssize_t endpos, 6670 Py_ssize_t *newpos) 6671 { 6672 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6673 Py_ssize_t len; 6674 PyObject *restuple; 6675 PyObject *resunicode; 6676 6677 if (*errorHandler == NULL) { 6678 
*errorHandler = PyCodec_LookupError(errors); 6679 if (*errorHandler == NULL) 6680 return NULL; 6681 } 6682 6683 if (PyUnicode_READY(unicode) == -1) 6684 return NULL; 6685 len = PyUnicode_GET_LENGTH(unicode); 6686 6687 make_encode_exception(exceptionObject, 6688 encoding, unicode, startpos, endpos, reason); 6689 if (*exceptionObject == NULL) 6690 return NULL; 6691 6692 restuple = PyObject_CallFunctionObjArgs( 6693 *errorHandler, *exceptionObject, NULL); 6694 if (restuple == NULL) 6695 return NULL; 6696 if (!PyTuple_Check(restuple)) { 6697 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6698 Py_DECREF(restuple); 6699 return NULL; 6700 } 6701 if (!PyArg_ParseTuple(restuple, argparse, 6702 &resunicode, newpos)) { 6703 Py_DECREF(restuple); 6704 return NULL; 6705 } 6706 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6707 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6708 Py_DECREF(restuple); 6709 return NULL; 6710 } 6711 if (*newpos<0) 6712 *newpos = len + *newpos; 6713 if (*newpos<0 || *newpos>len) { 6714 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6715 Py_DECREF(restuple); 6716 return NULL; 6717 } 6718 Py_INCREF(resunicode); 6719 Py_DECREF(restuple); 6720 return resunicode; 6721 } 6722 6723 static PyObject * 6724 unicode_encode_ucs1(PyObject *unicode, 6725 const char *errors, 6726 const Py_UCS4 limit) 6727 { 6728 /* input state */ 6729 Py_ssize_t pos=0, size; 6730 int kind; 6731 void *data; 6732 /* pointer into the output */ 6733 char *str; 6734 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6735 const char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; 6736 PyObject *error_handler_obj = NULL; 6737 PyObject *exc = NULL; 6738 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 6739 PyObject *rep = NULL; 6740 /* output object */ 6741 _PyBytesWriter writer; 6742 6743 if (PyUnicode_READY(unicode) == -1) 6744 return NULL; 6745 size = PyUnicode_GET_LENGTH(unicode); 6746 kind = PyUnicode_KIND(unicode); 6747 data = PyUnicode_DATA(unicode); 6748 /* allocate enough for a simple encoding without 6749 replacements, if we need more, we'll resize */ 6750 if (size == 0) 6751 return PyBytes_FromStringAndSize(NULL, 0); 6752 6753 _PyBytesWriter_Init(&writer); 6754 str = _PyBytesWriter_Alloc(&writer, size); 6755 if (str == NULL) 6756 return NULL; 6757 6758 while (pos < size) { 6759 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6760 6761 /* can we encode this? */ 6762 if (ch < limit) { 6763 /* no overflow check, because we know that the space is enough */ 6764 *str++ = (char)ch; 6765 ++pos; 6766 } 6767 else { 6768 Py_ssize_t newpos, i; 6769 /* startpos for collecting unencodable chars */ 6770 Py_ssize_t collstart = pos; 6771 Py_ssize_t collend = collstart + 1; 6772 /* find all unecodable characters */ 6773 6774 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit)) 6775 ++collend; 6776 6777 /* Only overallocate the buffer if it's not the last write */ 6778 writer.overallocate = (collend < size); 6779 6780 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6781 if (error_handler == _Py_ERROR_UNKNOWN) 6782 error_handler = get_error_handler(errors); 6783 6784 switch (error_handler) { 6785 case _Py_ERROR_STRICT: 6786 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6787 goto onError; 6788 6789 case _Py_ERROR_REPLACE: 6790 memset(str, '?', collend - collstart); 6791 str += (collend - collstart); 6792 /* fall through */ 6793 case _Py_ERROR_IGNORE: 6794 pos = collend; 6795 break; 6796 6797 case _Py_ERROR_BACKSLASHREPLACE: 6798 /* subtract preallocated bytes */ 6799 writer.min_size -= (collend - collstart); 6800 str = backslashreplace(&writer, str, 6801 unicode, collstart, collend); 6802 if (str == NULL) 6803 goto onError; 6804 pos = collend; 6805 break; 6806 6807 case _Py_ERROR_XMLCHARREFREPLACE: 6808 /* subtract preallocated bytes */ 6809 writer.min_size -= (collend - collstart); 6810 str = xmlcharrefreplace(&writer, str, 6811 unicode, collstart, collend); 6812 if (str == NULL) 6813 goto onError; 6814 pos = collend; 6815 break; 6816 6817 case _Py_ERROR_SURROGATEESCAPE: 6818 for (i = collstart; i < collend; ++i) { 6819 ch = PyUnicode_READ(kind, data, i); 6820 if (ch < 0xdc80 || 0xdcff < ch) { 6821 /* Not a UTF-8b surrogate */ 6822 break; 6823 } 6824 *str++ = (char)(ch - 0xdc00); 6825 ++pos; 6826 } 6827 if (i >= collend) 6828 break; 6829 collstart = pos; 6830 assert(collstart != collend); 6831 /* fall through */ 6832 6833 default: 6834 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj, 6835 encoding, reason, unicode, &exc, 6836 collstart, collend, &newpos); 6837 if (rep == NULL) 6838 goto onError; 6839 6840 /* subtract preallocated bytes */ 6841 writer.min_size -= newpos - collstart; 6842 6843 if (PyBytes_Check(rep)) { 6844 /* Directly copy bytes result to output. 
*/ 6845 str = _PyBytesWriter_WriteBytes(&writer, str, 6846 PyBytes_AS_STRING(rep), 6847 PyBytes_GET_SIZE(rep)); 6848 } 6849 else { 6850 assert(PyUnicode_Check(rep)); 6851 6852 if (PyUnicode_READY(rep) < 0) 6853 goto onError; 6854 6855 if (limit == 256 ? 6856 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND : 6857 !PyUnicode_IS_ASCII(rep)) 6858 { 6859 /* Not all characters are smaller than limit */ 6860 raise_encode_exception(&exc, encoding, unicode, 6861 collstart, collend, reason); 6862 goto onError; 6863 } 6864 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 6865 str = _PyBytesWriter_WriteBytes(&writer, str, 6866 PyUnicode_DATA(rep), 6867 PyUnicode_GET_LENGTH(rep)); 6868 } 6869 if (str == NULL) 6870 goto onError; 6871 6872 pos = newpos; 6873 Py_CLEAR(rep); 6874 } 6875 6876 /* If overallocation was disabled, ensure that it was the last 6877 write. Otherwise, we missed an optimization */ 6878 assert(writer.overallocate || pos == size); 6879 } 6880 } 6881 6882 Py_XDECREF(error_handler_obj); 6883 Py_XDECREF(exc); 6884 return _PyBytesWriter_Finish(&writer, str); 6885 6886 onError: 6887 Py_XDECREF(rep); 6888 _PyBytesWriter_Dealloc(&writer); 6889 Py_XDECREF(error_handler_obj); 6890 Py_XDECREF(exc); 6891 return NULL; 6892 } 6893 6894 /* Deprecated */ 6895 PyObject * 6896 PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6897 Py_ssize_t size, 6898 const char *errors) 6899 { 6900 PyObject *result; 6901 PyObject *unicode = PyUnicode_FromWideChar(p, size); 6902 if (unicode == NULL) 6903 return NULL; 6904 result = unicode_encode_ucs1(unicode, errors, 256); 6905 Py_DECREF(unicode); 6906 return result; 6907 } 6908 6909 PyObject * 6910 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6911 { 6912 if (!PyUnicode_Check(unicode)) { 6913 PyErr_BadArgument(); 6914 return NULL; 6915 } 6916 if (PyUnicode_READY(unicode) == -1) 6917 return NULL; 6918 /* Fast path: if it is a one-byte string, construct 6919 bytes object directly. 
*/ 6920 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6921 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6922 PyUnicode_GET_LENGTH(unicode)); 6923 /* Non-Latin-1 characters present. Defer to above function to 6924 raise the exception. */ 6925 return unicode_encode_ucs1(unicode, errors, 256); 6926 } 6927 6928 PyObject* 6929 PyUnicode_AsLatin1String(PyObject *unicode) 6930 { 6931 return _PyUnicode_AsLatin1String(unicode, NULL); 6932 } 6933 6934 /* --- 7-bit ASCII Codec -------------------------------------------------- */ 6935 6936 PyObject * 6937 PyUnicode_DecodeASCII(const char *s, 6938 Py_ssize_t size, 6939 const char *errors) 6940 { 6941 const char *starts = s; 6942 _PyUnicodeWriter writer; 6943 int kind; 6944 void *data; 6945 Py_ssize_t startinpos; 6946 Py_ssize_t endinpos; 6947 Py_ssize_t outpos; 6948 const char *e; 6949 PyObject *error_handler_obj = NULL; 6950 PyObject *exc = NULL; 6951 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 6952 6953 if (size == 0) 6954 _Py_RETURN_UNICODE_EMPTY(); 6955 6956 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 6957 if (size == 1 && (unsigned char)s[0] < 128) 6958 return get_latin1_char((unsigned char)s[0]); 6959 6960 _PyUnicodeWriter_Init(&writer); 6961 writer.min_length = size; 6962 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) 6963 return NULL; 6964 6965 e = s + size; 6966 data = writer.data; 6967 outpos = ascii_decode(s, e, (Py_UCS1 *)data); 6968 writer.pos = outpos; 6969 if (writer.pos == size) 6970 return _PyUnicodeWriter_Finish(&writer); 6971 6972 s += writer.pos; 6973 kind = writer.kind; 6974 while (s < e) { 6975 unsigned char c = (unsigned char)*s; 6976 if (c < 128) { 6977 PyUnicode_WRITE(kind, data, writer.pos, c); 6978 writer.pos++; 6979 ++s; 6980 continue; 6981 } 6982 6983 /* byte outsize range 0x00..0x7f: call the error handler */ 6984 6985 if (error_handler == _Py_ERROR_UNKNOWN) 6986 error_handler = get_error_handler(errors); 6987 6988 switch (error_handler) 6989 { 6990 case _Py_ERROR_REPLACE: 6991 case _Py_ERROR_SURROGATEESCAPE: 6992 /* Fast-path: the error handler only writes one character, 6993 but we may switch to UCS2 at the first write */ 6994 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) 6995 goto onError; 6996 kind = writer.kind; 6997 data = writer.data; 6998 6999 if (error_handler == _Py_ERROR_REPLACE) 7000 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd); 7001 else 7002 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00); 7003 writer.pos++; 7004 ++s; 7005 break; 7006 7007 case _Py_ERROR_IGNORE: 7008 ++s; 7009 break; 7010 7011 default: 7012 startinpos = s-starts; 7013 endinpos = startinpos + 1; 7014 if (unicode_decode_call_errorhandler_writer( 7015 errors, &error_handler_obj, 7016 "ascii", "ordinal not in range(128)", 7017 &starts, &e, &startinpos, &endinpos, &exc, &s, 7018 &writer)) 7019 goto onError; 7020 kind = writer.kind; 7021 data = writer.data; 7022 } 7023 } 7024 Py_XDECREF(error_handler_obj); 7025 Py_XDECREF(exc); 7026 return _PyUnicodeWriter_Finish(&writer); 7027 7028 onError: 7029 
_PyUnicodeWriter_Dealloc(&writer); 7030 Py_XDECREF(error_handler_obj); 7031 Py_XDECREF(exc); 7032 return NULL; 7033 } 7034 7035 /* Deprecated */ 7036 PyObject * 7037 PyUnicode_EncodeASCII(const Py_UNICODE *p, 7038 Py_ssize_t size, 7039 const char *errors) 7040 { 7041 PyObject *result; 7042 PyObject *unicode = PyUnicode_FromWideChar(p, size); 7043 if (unicode == NULL) 7044 return NULL; 7045 result = unicode_encode_ucs1(unicode, errors, 128); 7046 Py_DECREF(unicode); 7047 return result; 7048 } 7049 7050 PyObject * 7051 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 7052 { 7053 if (!PyUnicode_Check(unicode)) { 7054 PyErr_BadArgument(); 7055 return NULL; 7056 } 7057 if (PyUnicode_READY(unicode) == -1) 7058 return NULL; 7059 /* Fast path: if it is an ASCII-only string, construct bytes object 7060 directly. Else defer to above function to raise the exception. */ 7061 if (PyUnicode_IS_ASCII(unicode)) 7062 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 7063 PyUnicode_GET_LENGTH(unicode)); 7064 return unicode_encode_ucs1(unicode, errors, 128); 7065 } 7066 7067 PyObject * 7068 PyUnicode_AsASCIIString(PyObject *unicode) 7069 { 7070 return _PyUnicode_AsASCIIString(unicode, NULL); 7071 } 7072 7073 #ifdef MS_WINDOWS 7074 7075 /* --- MBCS codecs for Windows -------------------------------------------- */ 7076 7077 #if SIZEOF_INT < SIZEOF_SIZE_T 7078 #define NEED_RETRY 7079 #endif 7080 7081 #ifndef WC_ERR_INVALID_CHARS 7082 # define WC_ERR_INVALID_CHARS 0x0080 7083 #endif 7084 7085 static const char* 7086 code_page_name(UINT code_page, PyObject **obj) 7087 { 7088 *obj = NULL; 7089 if (code_page == CP_ACP) 7090 return "mbcs"; 7091 if (code_page == CP_UTF7) 7092 return "CP_UTF7"; 7093 if (code_page == CP_UTF8) 7094 return "CP_UTF8"; 7095 7096 *obj = PyBytes_FromFormat("cp%u", code_page); 7097 if (*obj == NULL) 7098 return NULL; 7099 return PyBytes_AS_STRING(*obj); 7100 } 7101 7102 static DWORD 7103 decode_code_page_flags(UINT code_page) 7104 { 7105 if 
(code_page == CP_UTF7) { 7106 /* The CP_UTF7 decoder only supports flags=0 */ 7107 return 0; 7108 } 7109 else 7110 return MB_ERR_INVALID_CHARS; 7111 } 7112 7113 /* 7114 * Decode a byte string from a Windows code page into unicode object in strict 7115 * mode. 7116 * 7117 * Returns consumed size if succeed, returns -2 on decode error, or raise an 7118 * OSError and returns -1 on other error. 7119 */ 7120 static int 7121 decode_code_page_strict(UINT code_page, 7122 PyObject **v, 7123 const char *in, 7124 int insize) 7125 { 7126 const DWORD flags = decode_code_page_flags(code_page); 7127 wchar_t *out; 7128 DWORD outsize; 7129 7130 /* First get the size of the result */ 7131 assert(insize > 0); 7132 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 7133 if (outsize <= 0) 7134 goto error; 7135 7136 if (*v == NULL) { 7137 /* Create unicode object */ 7138 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 7139 *v = (PyObject*)_PyUnicode_New(outsize); 7140 if (*v == NULL) 7141 return -1; 7142 out = PyUnicode_AS_UNICODE(*v); 7143 } 7144 else { 7145 /* Extend unicode object */ 7146 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 7147 if (unicode_resize(v, n + outsize) < 0) 7148 return -1; 7149 out = PyUnicode_AS_UNICODE(*v) + n; 7150 } 7151 7152 /* Do the conversion */ 7153 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 7154 if (outsize <= 0) 7155 goto error; 7156 return insize; 7157 7158 error: 7159 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7160 return -2; 7161 PyErr_SetFromWindowsErr(0); 7162 return -1; 7163 } 7164 7165 /* 7166 * Decode a byte string from a code page into unicode object with an error 7167 * handler. 7168 * 7169 * Returns consumed size if succeed, or raise an OSError or 7170 * UnicodeDecodeError exception and returns -1 on error. 
7171 */ 7172 static int 7173 decode_code_page_errors(UINT code_page, 7174 PyObject **v, 7175 const char *in, const int size, 7176 const char *errors, int final) 7177 { 7178 const char *startin = in; 7179 const char *endin = in + size; 7180 const DWORD flags = decode_code_page_flags(code_page); 7181 /* Ideally, we should get reason from FormatMessage. This is the Windows 7182 2000 English version of the message. */ 7183 const char *reason = "No mapping for the Unicode character exists " 7184 "in the target code page."; 7185 /* each step cannot decode more than 1 character, but a character can be 7186 represented as a surrogate pair */ 7187 wchar_t buffer[2], *out; 7188 int insize; 7189 Py_ssize_t outsize; 7190 PyObject *errorHandler = NULL; 7191 PyObject *exc = NULL; 7192 PyObject *encoding_obj = NULL; 7193 const char *encoding; 7194 DWORD err; 7195 int ret = -1; 7196 7197 assert(size > 0); 7198 7199 encoding = code_page_name(code_page, &encoding_obj); 7200 if (encoding == NULL) 7201 return -1; 7202 7203 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) { 7204 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 7205 UnicodeDecodeError. 
*/ 7206 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 7207 if (exc != NULL) { 7208 PyCodec_StrictErrors(exc); 7209 Py_CLEAR(exc); 7210 } 7211 goto error; 7212 } 7213 7214 if (*v == NULL) { 7215 /* Create unicode object */ 7216 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 7217 PyErr_NoMemory(); 7218 goto error; 7219 } 7220 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 7221 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 7222 if (*v == NULL) 7223 goto error; 7224 out = PyUnicode_AS_UNICODE(*v); 7225 } 7226 else { 7227 /* Extend unicode object */ 7228 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 7229 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 7230 PyErr_NoMemory(); 7231 goto error; 7232 } 7233 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 7234 goto error; 7235 out = PyUnicode_AS_UNICODE(*v) + n; 7236 } 7237 7238 /* Decode the byte string character per character */ 7239 while (in < endin) 7240 { 7241 /* Decode a character */ 7242 insize = 1; 7243 do 7244 { 7245 outsize = MultiByteToWideChar(code_page, flags, 7246 in, insize, 7247 buffer, Py_ARRAY_LENGTH(buffer)); 7248 if (outsize > 0) 7249 break; 7250 err = GetLastError(); 7251 if (err != ERROR_NO_UNICODE_TRANSLATION 7252 && err != ERROR_INSUFFICIENT_BUFFER) 7253 { 7254 PyErr_SetFromWindowsErr(0); 7255 goto error; 7256 } 7257 insize++; 7258 } 7259 /* 4=maximum length of a UTF-8 sequence */ 7260 while (insize <= 4 && (in + insize) <= endin); 7261 7262 if (outsize <= 0) { 7263 Py_ssize_t startinpos, endinpos, outpos; 7264 7265 /* last character in partial decode? 
*/ 7266 if (in + insize >= endin && !final) 7267 break; 7268 7269 startinpos = in - startin; 7270 endinpos = startinpos + 1; 7271 outpos = out - PyUnicode_AS_UNICODE(*v); 7272 if (unicode_decode_call_errorhandler_wchar( 7273 errors, &errorHandler, 7274 encoding, reason, 7275 &startin, &endin, &startinpos, &endinpos, &exc, &in, 7276 v, &outpos)) 7277 { 7278 goto error; 7279 } 7280 out = PyUnicode_AS_UNICODE(*v) + outpos; 7281 } 7282 else { 7283 in += insize; 7284 memcpy(out, buffer, outsize * sizeof(wchar_t)); 7285 out += outsize; 7286 } 7287 } 7288 7289 /* write a NUL character at the end */ 7290 *out = 0; 7291 7292 /* Extend unicode object */ 7293 outsize = out - PyUnicode_AS_UNICODE(*v); 7294 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 7295 if (unicode_resize(v, outsize) < 0) 7296 goto error; 7297 /* (in - startin) <= size and size is an int */ 7298 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int); 7299 7300 error: 7301 Py_XDECREF(encoding_obj); 7302 Py_XDECREF(errorHandler); 7303 Py_XDECREF(exc); 7304 return ret; 7305 } 7306 7307 static PyObject * 7308 decode_code_page_stateful(int code_page, 7309 const char *s, Py_ssize_t size, 7310 const char *errors, Py_ssize_t *consumed) 7311 { 7312 PyObject *v = NULL; 7313 int chunk_size, final, converted, done; 7314 7315 if (code_page < 0) { 7316 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7317 return NULL; 7318 } 7319 if (size < 0) { 7320 PyErr_BadInternalCall(); 7321 return NULL; 7322 } 7323 7324 if (consumed) 7325 *consumed = 0; 7326 7327 do 7328 { 7329 #ifdef NEED_RETRY 7330 if (size > INT_MAX) { 7331 chunk_size = INT_MAX; 7332 final = 0; 7333 done = 0; 7334 } 7335 else 7336 #endif 7337 { 7338 chunk_size = (int)size; 7339 final = (consumed == NULL); 7340 done = 1; 7341 } 7342 7343 if (chunk_size == 0 && done) { 7344 if (v != NULL) 7345 break; 7346 _Py_RETURN_UNICODE_EMPTY(); 7347 } 7348 7349 converted = decode_code_page_strict(code_page, &v, 7350 s, chunk_size); 7351 if (converted == -2) 7352 
converted = decode_code_page_errors(code_page, &v, 7353 s, chunk_size, 7354 errors, final); 7355 assert(converted != 0 || done); 7356 7357 if (converted < 0) { 7358 Py_XDECREF(v); 7359 return NULL; 7360 } 7361 7362 if (consumed) 7363 *consumed += converted; 7364 7365 s += converted; 7366 size -= converted; 7367 } while (!done); 7368 7369 return unicode_result(v); 7370 } 7371 7372 PyObject * 7373 PyUnicode_DecodeCodePageStateful(int code_page, 7374 const char *s, 7375 Py_ssize_t size, 7376 const char *errors, 7377 Py_ssize_t *consumed) 7378 { 7379 return decode_code_page_stateful(code_page, s, size, errors, consumed); 7380 } 7381 7382 PyObject * 7383 PyUnicode_DecodeMBCSStateful(const char *s, 7384 Py_ssize_t size, 7385 const char *errors, 7386 Py_ssize_t *consumed) 7387 { 7388 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 7389 } 7390 7391 PyObject * 7392 PyUnicode_DecodeMBCS(const char *s, 7393 Py_ssize_t size, 7394 const char *errors) 7395 { 7396 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 7397 } 7398 7399 static DWORD 7400 encode_code_page_flags(UINT code_page, const char *errors) 7401 { 7402 if (code_page == CP_UTF8) { 7403 return WC_ERR_INVALID_CHARS; 7404 } 7405 else if (code_page == CP_UTF7) { 7406 /* CP_UTF7 only supports flags=0 */ 7407 return 0; 7408 } 7409 else { 7410 if (errors != NULL && strcmp(errors, "replace") == 0) 7411 return 0; 7412 else 7413 return WC_NO_BEST_FIT_CHARS; 7414 } 7415 } 7416 7417 /* 7418 * Encode a Unicode string to a Windows code page into a byte string in strict 7419 * mode. 7420 * 7421 * Returns consumed characters if succeed, returns -2 on encode error, or raise 7422 * an OSError and returns -1 on other error. 
7423 */ 7424 static int 7425 encode_code_page_strict(UINT code_page, PyObject **outbytes, 7426 PyObject *unicode, Py_ssize_t offset, int len, 7427 const char* errors) 7428 { 7429 BOOL usedDefaultChar = FALSE; 7430 BOOL *pusedDefaultChar = &usedDefaultChar; 7431 int outsize; 7432 wchar_t *p; 7433 Py_ssize_t size; 7434 const DWORD flags = encode_code_page_flags(code_page, NULL); 7435 char *out; 7436 /* Create a substring so that we can get the UTF-16 representation 7437 of just the slice under consideration. */ 7438 PyObject *substring; 7439 7440 assert(len > 0); 7441 7442 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7443 pusedDefaultChar = &usedDefaultChar; 7444 else 7445 pusedDefaultChar = NULL; 7446 7447 substring = PyUnicode_Substring(unicode, offset, offset+len); 7448 if (substring == NULL) 7449 return -1; 7450 p = PyUnicode_AsUnicodeAndSize(substring, &size); 7451 if (p == NULL) { 7452 Py_DECREF(substring); 7453 return -1; 7454 } 7455 assert(size <= INT_MAX); 7456 7457 /* First get the size of the result */ 7458 outsize = WideCharToMultiByte(code_page, flags, 7459 p, (int)size, 7460 NULL, 0, 7461 NULL, pusedDefaultChar); 7462 if (outsize <= 0) 7463 goto error; 7464 /* If we used a default char, then we failed! 
*/ 7465 if (pusedDefaultChar && *pusedDefaultChar) { 7466 Py_DECREF(substring); 7467 return -2; 7468 } 7469 7470 if (*outbytes == NULL) { 7471 /* Create string object */ 7472 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7473 if (*outbytes == NULL) { 7474 Py_DECREF(substring); 7475 return -1; 7476 } 7477 out = PyBytes_AS_STRING(*outbytes); 7478 } 7479 else { 7480 /* Extend string object */ 7481 const Py_ssize_t n = PyBytes_Size(*outbytes); 7482 if (outsize > PY_SSIZE_T_MAX - n) { 7483 PyErr_NoMemory(); 7484 Py_DECREF(substring); 7485 return -1; 7486 } 7487 if (_PyBytes_Resize(outbytes, n + outsize) < 0) { 7488 Py_DECREF(substring); 7489 return -1; 7490 } 7491 out = PyBytes_AS_STRING(*outbytes) + n; 7492 } 7493 7494 /* Do the conversion */ 7495 outsize = WideCharToMultiByte(code_page, flags, 7496 p, (int)size, 7497 out, outsize, 7498 NULL, pusedDefaultChar); 7499 Py_CLEAR(substring); 7500 if (outsize <= 0) 7501 goto error; 7502 if (pusedDefaultChar && *pusedDefaultChar) 7503 return -2; 7504 return 0; 7505 7506 error: 7507 Py_XDECREF(substring); 7508 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7509 return -2; 7510 PyErr_SetFromWindowsErr(0); 7511 return -1; 7512 } 7513 7514 /* 7515 * Encode a Unicode string to a Windows code page into a byte string using an 7516 * error handler. 7517 * 7518 * Returns consumed characters if succeed, or raise an OSError and returns 7519 * -1 on other error. 7520 */ 7521 static int 7522 encode_code_page_errors(UINT code_page, PyObject **outbytes, 7523 PyObject *unicode, Py_ssize_t unicode_offset, 7524 Py_ssize_t insize, const char* errors) 7525 { 7526 const DWORD flags = encode_code_page_flags(code_page, errors); 7527 Py_ssize_t pos = unicode_offset; 7528 Py_ssize_t endin = unicode_offset + insize; 7529 /* Ideally, we should get reason from FormatMessage. This is the Windows 7530 2000 English version of the message. 
*/ 7531 const char *reason = "invalid character"; 7532 /* 4=maximum length of a UTF-8 sequence */ 7533 char buffer[4]; 7534 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 7535 Py_ssize_t outsize; 7536 char *out; 7537 PyObject *errorHandler = NULL; 7538 PyObject *exc = NULL; 7539 PyObject *encoding_obj = NULL; 7540 const char *encoding; 7541 Py_ssize_t newpos, newoutsize; 7542 PyObject *rep; 7543 int ret = -1; 7544 7545 assert(insize > 0); 7546 7547 encoding = code_page_name(code_page, &encoding_obj); 7548 if (encoding == NULL) 7549 return -1; 7550 7551 if (errors == NULL || strcmp(errors, "strict") == 0) { 7552 /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7553 then we raise a UnicodeEncodeError. */ 7554 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7555 if (exc != NULL) { 7556 PyCodec_StrictErrors(exc); 7557 Py_DECREF(exc); 7558 } 7559 Py_XDECREF(encoding_obj); 7560 return -1; 7561 } 7562 7563 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7564 pusedDefaultChar = &usedDefaultChar; 7565 else 7566 pusedDefaultChar = NULL; 7567 7568 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7569 PyErr_NoMemory(); 7570 goto error; 7571 } 7572 outsize = insize * Py_ARRAY_LENGTH(buffer); 7573 7574 if (*outbytes == NULL) { 7575 /* Create string object */ 7576 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7577 if (*outbytes == NULL) 7578 goto error; 7579 out = PyBytes_AS_STRING(*outbytes); 7580 } 7581 else { 7582 /* Extend string object */ 7583 Py_ssize_t n = PyBytes_Size(*outbytes); 7584 if (n > PY_SSIZE_T_MAX - outsize) { 7585 PyErr_NoMemory(); 7586 goto error; 7587 } 7588 if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7589 goto error; 7590 out = PyBytes_AS_STRING(*outbytes) + n; 7591 } 7592 7593 /* Encode the string character per character */ 7594 while (pos < endin) 7595 { 7596 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7597 wchar_t chars[2]; 7598 int charsize; 7599 if (ch < 0x10000) { 7600 chars[0] = (wchar_t)ch; 7601 charsize = 
1; 7602 } 7603 else { 7604 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch); 7605 chars[1] = Py_UNICODE_LOW_SURROGATE(ch); 7606 charsize = 2; 7607 } 7608 7609 outsize = WideCharToMultiByte(code_page, flags, 7610 chars, charsize, 7611 buffer, Py_ARRAY_LENGTH(buffer), 7612 NULL, pusedDefaultChar); 7613 if (outsize > 0) { 7614 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7615 { 7616 pos++; 7617 memcpy(out, buffer, outsize); 7618 out += outsize; 7619 continue; 7620 } 7621 } 7622 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7623 PyErr_SetFromWindowsErr(0); 7624 goto error; 7625 } 7626 7627 rep = unicode_encode_call_errorhandler( 7628 errors, &errorHandler, encoding, reason, 7629 unicode, &exc, 7630 pos, pos + 1, &newpos); 7631 if (rep == NULL) 7632 goto error; 7633 pos = newpos; 7634 7635 if (PyBytes_Check(rep)) { 7636 outsize = PyBytes_GET_SIZE(rep); 7637 if (outsize != 1) { 7638 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7639 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7640 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7641 Py_DECREF(rep); 7642 goto error; 7643 } 7644 out = PyBytes_AS_STRING(*outbytes) + offset; 7645 } 7646 memcpy(out, PyBytes_AS_STRING(rep), outsize); 7647 out += outsize; 7648 } 7649 else { 7650 Py_ssize_t i; 7651 enum PyUnicode_Kind kind; 7652 void *data; 7653 7654 if (PyUnicode_READY(rep) == -1) { 7655 Py_DECREF(rep); 7656 goto error; 7657 } 7658 7659 outsize = PyUnicode_GET_LENGTH(rep); 7660 if (outsize != 1) { 7661 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7662 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7663 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7664 Py_DECREF(rep); 7665 goto error; 7666 } 7667 out = PyBytes_AS_STRING(*outbytes) + offset; 7668 } 7669 kind = PyUnicode_KIND(rep); 7670 data = PyUnicode_DATA(rep); 7671 for (i=0; i < outsize; i++) { 7672 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7673 if (ch > 127) { 7674 raise_encode_exception(&exc, 7675 encoding, 
unicode, 7676 pos, pos + 1, 7677 "unable to encode error handler result to ASCII"); 7678 Py_DECREF(rep); 7679 goto error; 7680 } 7681 *out = (unsigned char)ch; 7682 out++; 7683 } 7684 } 7685 Py_DECREF(rep); 7686 } 7687 /* write a NUL byte */ 7688 *out = 0; 7689 outsize = out - PyBytes_AS_STRING(*outbytes); 7690 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 7691 if (_PyBytes_Resize(outbytes, outsize) < 0) 7692 goto error; 7693 ret = 0; 7694 7695 error: 7696 Py_XDECREF(encoding_obj); 7697 Py_XDECREF(errorHandler); 7698 Py_XDECREF(exc); 7699 return ret; 7700 } 7701 7702 static PyObject * 7703 encode_code_page(int code_page, 7704 PyObject *unicode, 7705 const char *errors) 7706 { 7707 Py_ssize_t len; 7708 PyObject *outbytes = NULL; 7709 Py_ssize_t offset; 7710 int chunk_len, ret, done; 7711 7712 if (!PyUnicode_Check(unicode)) { 7713 PyErr_BadArgument(); 7714 return NULL; 7715 } 7716 7717 if (PyUnicode_READY(unicode) == -1) 7718 return NULL; 7719 len = PyUnicode_GET_LENGTH(unicode); 7720 7721 if (code_page < 0) { 7722 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7723 return NULL; 7724 } 7725 7726 if (len == 0) 7727 return PyBytes_FromStringAndSize(NULL, 0); 7728 7729 offset = 0; 7730 do 7731 { 7732 #ifdef NEED_RETRY 7733 /* UTF-16 encoding may double the size, so use only INT_MAX/2 7734 chunks. 
*/ 7735 if (len > INT_MAX/2) { 7736 chunk_len = INT_MAX/2; 7737 done = 0; 7738 } 7739 else 7740 #endif 7741 { 7742 chunk_len = (int)len; 7743 done = 1; 7744 } 7745 7746 ret = encode_code_page_strict(code_page, &outbytes, 7747 unicode, offset, chunk_len, 7748 errors); 7749 if (ret == -2) 7750 ret = encode_code_page_errors(code_page, &outbytes, 7751 unicode, offset, 7752 chunk_len, errors); 7753 if (ret < 0) { 7754 Py_XDECREF(outbytes); 7755 return NULL; 7756 } 7757 7758 offset += chunk_len; 7759 len -= chunk_len; 7760 } while (!done); 7761 7762 return outbytes; 7763 } 7764 7765 PyObject * 7766 PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7767 Py_ssize_t size, 7768 const char *errors) 7769 { 7770 PyObject *unicode, *res; 7771 unicode = PyUnicode_FromWideChar(p, size); 7772 if (unicode == NULL) 7773 return NULL; 7774 res = encode_code_page(CP_ACP, unicode, errors); 7775 Py_DECREF(unicode); 7776 return res; 7777 } 7778 7779 PyObject * 7780 PyUnicode_EncodeCodePage(int code_page, 7781 PyObject *unicode, 7782 const char *errors) 7783 { 7784 return encode_code_page(code_page, unicode, errors); 7785 } 7786 7787 PyObject * 7788 PyUnicode_AsMBCSString(PyObject *unicode) 7789 { 7790 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 7791 } 7792 7793 #undef NEED_RETRY 7794 7795 #endif /* MS_WINDOWS */ 7796 7797 /* --- Character Mapping Codec -------------------------------------------- */ 7798 7799 static int 7800 charmap_decode_string(const char *s, 7801 Py_ssize_t size, 7802 PyObject *mapping, 7803 const char *errors, 7804 _PyUnicodeWriter *writer) 7805 { 7806 const char *starts = s; 7807 const char *e; 7808 Py_ssize_t startinpos, endinpos; 7809 PyObject *errorHandler = NULL, *exc = NULL; 7810 Py_ssize_t maplen; 7811 enum PyUnicode_Kind mapkind; 7812 void *mapdata; 7813 Py_UCS4 x; 7814 unsigned char ch; 7815 7816 if (PyUnicode_READY(mapping) == -1) 7817 return -1; 7818 7819 maplen = PyUnicode_GET_LENGTH(mapping); 7820 mapdata = PyUnicode_DATA(mapping); 7821 mapkind = 
PyUnicode_KIND(mapping); 7822 7823 e = s + size; 7824 7825 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) { 7826 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1 7827 * is disabled in encoding aliases, latin1 is preferred because 7828 * its implementation is faster. */ 7829 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata; 7830 Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 7831 Py_UCS4 maxchar = writer->maxchar; 7832 7833 assert (writer->kind == PyUnicode_1BYTE_KIND); 7834 while (s < e) { 7835 ch = *s; 7836 x = mapdata_ucs1[ch]; 7837 if (x > maxchar) { 7838 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1) 7839 goto onError; 7840 maxchar = writer->maxchar; 7841 outdata = (Py_UCS1 *)writer->data; 7842 } 7843 outdata[writer->pos] = x; 7844 writer->pos++; 7845 ++s; 7846 } 7847 return 0; 7848 } 7849 7850 while (s < e) { 7851 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { 7852 enum PyUnicode_Kind outkind = writer->kind; 7853 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata; 7854 if (outkind == PyUnicode_1BYTE_KIND) { 7855 Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 7856 Py_UCS4 maxchar = writer->maxchar; 7857 while (s < e) { 7858 ch = *s; 7859 x = mapdata_ucs2[ch]; 7860 if (x > maxchar) 7861 goto Error; 7862 outdata[writer->pos] = x; 7863 writer->pos++; 7864 ++s; 7865 } 7866 break; 7867 } 7868 else if (outkind == PyUnicode_2BYTE_KIND) { 7869 Py_UCS2 *outdata = (Py_UCS2 *)writer->data; 7870 while (s < e) { 7871 ch = *s; 7872 x = mapdata_ucs2[ch]; 7873 if (x == 0xFFFE) 7874 goto Error; 7875 outdata[writer->pos] = x; 7876 writer->pos++; 7877 ++s; 7878 } 7879 break; 7880 } 7881 } 7882 ch = *s; 7883 7884 if (ch < maplen) 7885 x = PyUnicode_READ(mapkind, mapdata, ch); 7886 else 7887 x = 0xfffe; /* invalid value */ 7888 Error: 7889 if (x == 0xfffe) 7890 { 7891 /* undefined mapping */ 7892 startinpos = s-starts; 7893 endinpos = startinpos+1; 7894 if (unicode_decode_call_errorhandler_writer( 7895 errors, &errorHandler, 7896 "charmap", "character maps to 
<undefined>", 7897 &starts, &e, &startinpos, &endinpos, &exc, &s, 7898 writer)) { 7899 goto onError; 7900 } 7901 continue; 7902 } 7903 7904 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0) 7905 goto onError; 7906 ++s; 7907 } 7908 Py_XDECREF(errorHandler); 7909 Py_XDECREF(exc); 7910 return 0; 7911 7912 onError: 7913 Py_XDECREF(errorHandler); 7914 Py_XDECREF(exc); 7915 return -1; 7916 } 7917 7918 static int 7919 charmap_decode_mapping(const char *s, 7920 Py_ssize_t size, 7921 PyObject *mapping, 7922 const char *errors, 7923 _PyUnicodeWriter *writer) 7924 { 7925 const char *starts = s; 7926 const char *e; 7927 Py_ssize_t startinpos, endinpos; 7928 PyObject *errorHandler = NULL, *exc = NULL; 7929 unsigned char ch; 7930 PyObject *key, *item = NULL; 7931 7932 e = s + size; 7933 7934 while (s < e) { 7935 ch = *s; 7936 7937 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7938 key = PyLong_FromLong((long)ch); 7939 if (key == NULL) 7940 goto onError; 7941 7942 item = PyObject_GetItem(mapping, key); 7943 Py_DECREF(key); 7944 if (item == NULL) { 7945 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7946 /* No mapping found means: mapping is undefined. 
*/ 7947 PyErr_Clear(); 7948 goto Undefined; 7949 } else 7950 goto onError; 7951 } 7952 7953 /* Apply mapping */ 7954 if (item == Py_None) 7955 goto Undefined; 7956 if (PyLong_Check(item)) { 7957 long value = PyLong_AS_LONG(item); 7958 if (value == 0xFFFE) 7959 goto Undefined; 7960 if (value < 0 || value > MAX_UNICODE) { 7961 PyErr_Format(PyExc_TypeError, 7962 "character mapping must be in range(0x%lx)", 7963 (unsigned long)MAX_UNICODE + 1); 7964 goto onError; 7965 } 7966 7967 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7968 goto onError; 7969 } 7970 else if (PyUnicode_Check(item)) { 7971 if (PyUnicode_READY(item) == -1) 7972 goto onError; 7973 if (PyUnicode_GET_LENGTH(item) == 1) { 7974 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0); 7975 if (value == 0xFFFE) 7976 goto Undefined; 7977 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7978 goto onError; 7979 } 7980 else { 7981 writer->overallocate = 1; 7982 if (_PyUnicodeWriter_WriteStr(writer, item) == -1) 7983 goto onError; 7984 } 7985 } 7986 else { 7987 /* wrong return value */ 7988 PyErr_SetString(PyExc_TypeError, 7989 "character mapping must return integer, None or str"); 7990 goto onError; 7991 } 7992 Py_CLEAR(item); 7993 ++s; 7994 continue; 7995 7996 Undefined: 7997 /* undefined mapping */ 7998 Py_CLEAR(item); 7999 startinpos = s-starts; 8000 endinpos = startinpos+1; 8001 if (unicode_decode_call_errorhandler_writer( 8002 errors, &errorHandler, 8003 "charmap", "character maps to <undefined>", 8004 &starts, &e, &startinpos, &endinpos, &exc, &s, 8005 writer)) { 8006 goto onError; 8007 } 8008 } 8009 Py_XDECREF(errorHandler); 8010 Py_XDECREF(exc); 8011 return 0; 8012 8013 onError: 8014 Py_XDECREF(item); 8015 Py_XDECREF(errorHandler); 8016 Py_XDECREF(exc); 8017 return -1; 8018 } 8019 8020 PyObject * 8021 PyUnicode_DecodeCharmap(const char *s, 8022 Py_ssize_t size, 8023 PyObject *mapping, 8024 const char *errors) 8025 { 8026 _PyUnicodeWriter writer; 8027 8028 /* Default to Latin-1 */ 8029 if 
(mapping == NULL) 8030 return PyUnicode_DecodeLatin1(s, size, errors); 8031 8032 if (size == 0) 8033 _Py_RETURN_UNICODE_EMPTY(); 8034 _PyUnicodeWriter_Init(&writer); 8035 writer.min_length = size; 8036 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 8037 goto onError; 8038 8039 if (PyUnicode_CheckExact(mapping)) { 8040 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0) 8041 goto onError; 8042 } 8043 else { 8044 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0) 8045 goto onError; 8046 } 8047 return _PyUnicodeWriter_Finish(&writer); 8048 8049 onError: 8050 _PyUnicodeWriter_Dealloc(&writer); 8051 return NULL; 8052 } 8053 8054 /* Charmap encoding: the lookup table */ 8055 8056 struct encoding_map { 8057 PyObject_HEAD 8058 unsigned char level1[32]; 8059 int count2, count3; 8060 unsigned char level23[1]; 8061 }; 8062 8063 static PyObject* 8064 encoding_map_size(PyObject *obj, PyObject* args) 8065 { 8066 struct encoding_map *map = (struct encoding_map*)obj; 8067 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 8068 128*map->count3); 8069 } 8070 8071 static PyMethodDef encoding_map_methods[] = { 8072 {"size", encoding_map_size, METH_NOARGS, 8073 PyDoc_STR("Return the size (in bytes) of this object") }, 8074 { 0 } 8075 }; 8076 8077 static void 8078 encoding_map_dealloc(PyObject* o) 8079 { 8080 PyObject_FREE(o); 8081 } 8082 8083 static PyTypeObject EncodingMapType = { 8084 PyVarObject_HEAD_INIT(NULL, 0) 8085 "EncodingMap", /*tp_name*/ 8086 sizeof(struct encoding_map), /*tp_basicsize*/ 8087 0, /*tp_itemsize*/ 8088 /* methods */ 8089 encoding_map_dealloc, /*tp_dealloc*/ 8090 0, /*tp_print*/ 8091 0, /*tp_getattr*/ 8092 0, /*tp_setattr*/ 8093 0, /*tp_reserved*/ 8094 0, /*tp_repr*/ 8095 0, /*tp_as_number*/ 8096 0, /*tp_as_sequence*/ 8097 0, /*tp_as_mapping*/ 8098 0, /*tp_hash*/ 8099 0, /*tp_call*/ 8100 0, /*tp_str*/ 8101 0, /*tp_getattro*/ 8102 0, /*tp_setattro*/ 8103 0, /*tp_as_buffer*/ 8104 Py_TPFLAGS_DEFAULT, 
/*tp_flags*/ 8105 0, /*tp_doc*/ 8106 0, /*tp_traverse*/ 8107 0, /*tp_clear*/ 8108 0, /*tp_richcompare*/ 8109 0, /*tp_weaklistoffset*/ 8110 0, /*tp_iter*/ 8111 0, /*tp_iternext*/ 8112 encoding_map_methods, /*tp_methods*/ 8113 0, /*tp_members*/ 8114 0, /*tp_getset*/ 8115 0, /*tp_base*/ 8116 0, /*tp_dict*/ 8117 0, /*tp_descr_get*/ 8118 0, /*tp_descr_set*/ 8119 0, /*tp_dictoffset*/ 8120 0, /*tp_init*/ 8121 0, /*tp_alloc*/ 8122 0, /*tp_new*/ 8123 0, /*tp_free*/ 8124 0, /*tp_is_gc*/ 8125 }; 8126 8127 PyObject* 8128 PyUnicode_BuildEncodingMap(PyObject* string) 8129 { 8130 PyObject *result; 8131 struct encoding_map *mresult; 8132 int i; 8133 int need_dict = 0; 8134 unsigned char level1[32]; 8135 unsigned char level2[512]; 8136 unsigned char *mlevel1, *mlevel2, *mlevel3; 8137 int count2 = 0, count3 = 0; 8138 int kind; 8139 void *data; 8140 Py_ssize_t length; 8141 Py_UCS4 ch; 8142 8143 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) { 8144 PyErr_BadArgument(); 8145 return NULL; 8146 } 8147 kind = PyUnicode_KIND(string); 8148 data = PyUnicode_DATA(string); 8149 length = PyUnicode_GET_LENGTH(string); 8150 length = Py_MIN(length, 256); 8151 memset(level1, 0xFF, sizeof level1); 8152 memset(level2, 0xFF, sizeof level2); 8153 8154 /* If there isn't a one-to-one mapping of NULL to \0, 8155 or if there are non-BMP characters, we need to use 8156 a mapping dictionary. 
*/ 8157 if (PyUnicode_READ(kind, data, 0) != 0) 8158 need_dict = 1; 8159 for (i = 1; i < length; i++) { 8160 int l1, l2; 8161 ch = PyUnicode_READ(kind, data, i); 8162 if (ch == 0 || ch > 0xFFFF) { 8163 need_dict = 1; 8164 break; 8165 } 8166 if (ch == 0xFFFE) 8167 /* unmapped character */ 8168 continue; 8169 l1 = ch >> 11; 8170 l2 = ch >> 7; 8171 if (level1[l1] == 0xFF) 8172 level1[l1] = count2++; 8173 if (level2[l2] == 0xFF) 8174 level2[l2] = count3++; 8175 } 8176 8177 if (count2 >= 0xFF || count3 >= 0xFF) 8178 need_dict = 1; 8179 8180 if (need_dict) { 8181 PyObject *result = PyDict_New(); 8182 PyObject *key, *value; 8183 if (!result) 8184 return NULL; 8185 for (i = 0; i < length; i++) { 8186 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 8187 value = PyLong_FromLong(i); 8188 if (!key || !value) 8189 goto failed1; 8190 if (PyDict_SetItem(result, key, value) == -1) 8191 goto failed1; 8192 Py_DECREF(key); 8193 Py_DECREF(value); 8194 } 8195 return result; 8196 failed1: 8197 Py_XDECREF(key); 8198 Py_XDECREF(value); 8199 Py_DECREF(result); 8200 return NULL; 8201 } 8202 8203 /* Create a three-level trie */ 8204 result = PyObject_MALLOC(sizeof(struct encoding_map) + 8205 16*count2 + 128*count3 - 1); 8206 if (!result) 8207 return PyErr_NoMemory(); 8208 PyObject_Init(result, &EncodingMapType); 8209 mresult = (struct encoding_map*)result; 8210 mresult->count2 = count2; 8211 mresult->count3 = count3; 8212 mlevel1 = mresult->level1; 8213 mlevel2 = mresult->level23; 8214 mlevel3 = mresult->level23 + 16*count2; 8215 memcpy(mlevel1, level1, 32); 8216 memset(mlevel2, 0xFF, 16*count2); 8217 memset(mlevel3, 0, 128*count3); 8218 count3 = 0; 8219 for (i = 1; i < length; i++) { 8220 int o1, o2, o3, i2, i3; 8221 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8222 if (ch == 0xFFFE) 8223 /* unmapped character */ 8224 continue; 8225 o1 = ch>>11; 8226 o2 = (ch>>7) & 0xF; 8227 i2 = 16*mlevel1[o1] + o2; 8228 if (mlevel2[i2] == 0xFF) 8229 mlevel2[i2] = count3++; 8230 o3 = ch & 0x7F; 8231 
i3 = 128*mlevel2[i2] + o3; 8232 mlevel3[i3] = i; 8233 } 8234 return result; 8235 } 8236 8237 static int 8238 encoding_map_lookup(Py_UCS4 c, PyObject *mapping) 8239 { 8240 struct encoding_map *map = (struct encoding_map*)mapping; 8241 int l1 = c>>11; 8242 int l2 = (c>>7) & 0xF; 8243 int l3 = c & 0x7F; 8244 int i; 8245 8246 if (c > 0xFFFF) 8247 return -1; 8248 if (c == 0) 8249 return 0; 8250 /* level 1*/ 8251 i = map->level1[l1]; 8252 if (i == 0xFF) { 8253 return -1; 8254 } 8255 /* level 2*/ 8256 i = map->level23[16*i+l2]; 8257 if (i == 0xFF) { 8258 return -1; 8259 } 8260 /* level 3 */ 8261 i = map->level23[16*map->count2 + 128*i + l3]; 8262 if (i == 0) { 8263 return -1; 8264 } 8265 return i; 8266 } 8267 8268 /* Lookup the character ch in the mapping. If the character 8269 can't be found, Py_None is returned (or NULL, if another 8270 error occurred). */ 8271 static PyObject * 8272 charmapencode_lookup(Py_UCS4 c, PyObject *mapping) 8273 { 8274 PyObject *w = PyLong_FromLong((long)c); 8275 PyObject *x; 8276 8277 if (w == NULL) 8278 return NULL; 8279 x = PyObject_GetItem(mapping, w); 8280 Py_DECREF(w); 8281 if (x == NULL) { 8282 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8283 /* No mapping found means: mapping is undefined. 
*/ 8284 PyErr_Clear(); 8285 Py_RETURN_NONE; 8286 } else 8287 return NULL; 8288 } 8289 else if (x == Py_None) 8290 return x; 8291 else if (PyLong_Check(x)) { 8292 long value = PyLong_AS_LONG(x); 8293 if (value < 0 || value > 255) { 8294 PyErr_SetString(PyExc_TypeError, 8295 "character mapping must be in range(256)"); 8296 Py_DECREF(x); 8297 return NULL; 8298 } 8299 return x; 8300 } 8301 else if (PyBytes_Check(x)) 8302 return x; 8303 else { 8304 /* wrong return value */ 8305 PyErr_Format(PyExc_TypeError, 8306 "character mapping must return integer, bytes or None, not %.400s", 8307 x->ob_type->tp_name); 8308 Py_DECREF(x); 8309 return NULL; 8310 } 8311 } 8312 8313 static int 8314 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 8315 { 8316 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8317 /* exponentially overallocate to minimize reallocations */ 8318 if (requiredsize < 2*outsize) 8319 requiredsize = 2*outsize; 8320 if (_PyBytes_Resize(outobj, requiredsize)) 8321 return -1; 8322 return 0; 8323 } 8324 8325 typedef enum charmapencode_result { 8326 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 8327 } charmapencode_result; 8328 /* lookup the character, put the result in the output string and adjust 8329 various state variables. Resize the output bytes object if not enough 8330 space is available. Return a new reference to the object that 8331 was put in the output buffer, or Py_None, if the mapping was undefined 8332 (in which case no character was written) or NULL, if a 8333 reallocation error occurred. 
The caller must decref the result */ 8334 static charmapencode_result 8335 charmapencode_output(Py_UCS4 c, PyObject *mapping, 8336 PyObject **outobj, Py_ssize_t *outpos) 8337 { 8338 PyObject *rep; 8339 char *outstart; 8340 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8341 8342 if (Py_TYPE(mapping) == &EncodingMapType) { 8343 int res = encoding_map_lookup(c, mapping); 8344 Py_ssize_t requiredsize = *outpos+1; 8345 if (res == -1) 8346 return enc_FAILED; 8347 if (outsize<requiredsize) 8348 if (charmapencode_resize(outobj, outpos, requiredsize)) 8349 return enc_EXCEPTION; 8350 outstart = PyBytes_AS_STRING(*outobj); 8351 outstart[(*outpos)++] = (char)res; 8352 return enc_SUCCESS; 8353 } 8354 8355 rep = charmapencode_lookup(c, mapping); 8356 if (rep==NULL) 8357 return enc_EXCEPTION; 8358 else if (rep==Py_None) { 8359 Py_DECREF(rep); 8360 return enc_FAILED; 8361 } else { 8362 if (PyLong_Check(rep)) { 8363 Py_ssize_t requiredsize = *outpos+1; 8364 if (outsize<requiredsize) 8365 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8366 Py_DECREF(rep); 8367 return enc_EXCEPTION; 8368 } 8369 outstart = PyBytes_AS_STRING(*outobj); 8370 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 8371 } 8372 else { 8373 const char *repchars = PyBytes_AS_STRING(rep); 8374 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 8375 Py_ssize_t requiredsize = *outpos+repsize; 8376 if (outsize<requiredsize) 8377 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8378 Py_DECREF(rep); 8379 return enc_EXCEPTION; 8380 } 8381 outstart = PyBytes_AS_STRING(*outobj); 8382 memcpy(outstart + *outpos, repchars, repsize); 8383 *outpos += repsize; 8384 } 8385 } 8386 Py_DECREF(rep); 8387 return enc_SUCCESS; 8388 } 8389 8390 /* handle an error in PyUnicode_EncodeCharmap 8391 Return 0 on success, -1 on error */ 8392 static int 8393 charmap_encoding_error( 8394 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, 8395 PyObject **exceptionObject, 8396 _Py_error_handler *error_handler, PyObject 
**error_handler_obj, const char *errors, 8397 PyObject **res, Py_ssize_t *respos) 8398 { 8399 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8400 Py_ssize_t size, repsize; 8401 Py_ssize_t newpos; 8402 enum PyUnicode_Kind kind; 8403 void *data; 8404 Py_ssize_t index; 8405 /* startpos for collecting unencodable chars */ 8406 Py_ssize_t collstartpos = *inpos; 8407 Py_ssize_t collendpos = *inpos+1; 8408 Py_ssize_t collpos; 8409 const char *encoding = "charmap"; 8410 const char *reason = "character maps to <undefined>"; 8411 charmapencode_result x; 8412 Py_UCS4 ch; 8413 int val; 8414 8415 if (PyUnicode_READY(unicode) == -1) 8416 return -1; 8417 size = PyUnicode_GET_LENGTH(unicode); 8418 /* find all unencodable characters */ 8419 while (collendpos < size) { 8420 PyObject *rep; 8421 if (Py_TYPE(mapping) == &EncodingMapType) { 8422 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8423 val = encoding_map_lookup(ch, mapping); 8424 if (val != -1) 8425 break; 8426 ++collendpos; 8427 continue; 8428 } 8429 8430 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8431 rep = charmapencode_lookup(ch, mapping); 8432 if (rep==NULL) 8433 return -1; 8434 else if (rep!=Py_None) { 8435 Py_DECREF(rep); 8436 break; 8437 } 8438 Py_DECREF(rep); 8439 ++collendpos; 8440 } 8441 /* cache callback name lookup 8442 * (if not done yet, i.e. 
it's the first error) */ 8443 if (*error_handler == _Py_ERROR_UNKNOWN) 8444 *error_handler = get_error_handler(errors); 8445 8446 switch (*error_handler) { 8447 case _Py_ERROR_STRICT: 8448 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8449 return -1; 8450 8451 case _Py_ERROR_REPLACE: 8452 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 8453 x = charmapencode_output('?', mapping, res, respos); 8454 if (x==enc_EXCEPTION) { 8455 return -1; 8456 } 8457 else if (x==enc_FAILED) { 8458 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8459 return -1; 8460 } 8461 } 8462 /* fall through */ 8463 case _Py_ERROR_IGNORE: 8464 *inpos = collendpos; 8465 break; 8466 8467 case _Py_ERROR_XMLCHARREFREPLACE: 8468 /* generate replacement (temporarily (mis)uses p) */ 8469 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 8470 char buffer[2+29+1+1]; 8471 char *cp; 8472 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos)); 8473 for (cp = buffer; *cp; ++cp) { 8474 x = charmapencode_output(*cp, mapping, res, respos); 8475 if (x==enc_EXCEPTION) 8476 return -1; 8477 else if (x==enc_FAILED) { 8478 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8479 return -1; 8480 } 8481 } 8482 } 8483 *inpos = collendpos; 8484 break; 8485 8486 default: 8487 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj, 8488 encoding, reason, unicode, exceptionObject, 8489 collstartpos, collendpos, &newpos); 8490 if (repunicode == NULL) 8491 return -1; 8492 if (PyBytes_Check(repunicode)) { 8493 /* Directly copy bytes result to output. */ 8494 Py_ssize_t outsize = PyBytes_Size(*res); 8495 Py_ssize_t requiredsize; 8496 repsize = PyBytes_Size(repunicode); 8497 requiredsize = *respos + repsize; 8498 if (requiredsize > outsize) 8499 /* Make room for all additional bytes. 
*/ 8500 if (charmapencode_resize(res, respos, requiredsize)) { 8501 Py_DECREF(repunicode); 8502 return -1; 8503 } 8504 memcpy(PyBytes_AsString(*res) + *respos, 8505 PyBytes_AsString(repunicode), repsize); 8506 *respos += repsize; 8507 *inpos = newpos; 8508 Py_DECREF(repunicode); 8509 break; 8510 } 8511 /* generate replacement */ 8512 if (PyUnicode_READY(repunicode) == -1) { 8513 Py_DECREF(repunicode); 8514 return -1; 8515 } 8516 repsize = PyUnicode_GET_LENGTH(repunicode); 8517 data = PyUnicode_DATA(repunicode); 8518 kind = PyUnicode_KIND(repunicode); 8519 for (index = 0; index < repsize; index++) { 8520 Py_UCS4 repch = PyUnicode_READ(kind, data, index); 8521 x = charmapencode_output(repch, mapping, res, respos); 8522 if (x==enc_EXCEPTION) { 8523 Py_DECREF(repunicode); 8524 return -1; 8525 } 8526 else if (x==enc_FAILED) { 8527 Py_DECREF(repunicode); 8528 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8529 return -1; 8530 } 8531 } 8532 *inpos = newpos; 8533 Py_DECREF(repunicode); 8534 } 8535 return 0; 8536 } 8537 8538 PyObject * 8539 _PyUnicode_EncodeCharmap(PyObject *unicode, 8540 PyObject *mapping, 8541 const char *errors) 8542 { 8543 /* output object */ 8544 PyObject *res = NULL; 8545 /* current input position */ 8546 Py_ssize_t inpos = 0; 8547 Py_ssize_t size; 8548 /* current output position */ 8549 Py_ssize_t respos = 0; 8550 PyObject *error_handler_obj = NULL; 8551 PyObject *exc = NULL; 8552 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 8553 void *data; 8554 int kind; 8555 8556 if (PyUnicode_READY(unicode) == -1) 8557 return NULL; 8558 size = PyUnicode_GET_LENGTH(unicode); 8559 data = PyUnicode_DATA(unicode); 8560 kind = PyUnicode_KIND(unicode); 8561 8562 /* Default to Latin-1 */ 8563 if (mapping == NULL) 8564 return unicode_encode_ucs1(unicode, errors, 256); 8565 8566 /* allocate enough for a simple encoding without 8567 replacements, if we need more, we'll resize */ 8568 res = 
PyBytes_FromStringAndSize(NULL, size); 8569 if (res == NULL) 8570 goto onError; 8571 if (size == 0) 8572 return res; 8573 8574 while (inpos<size) { 8575 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos); 8576 /* try to encode it */ 8577 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 8578 if (x==enc_EXCEPTION) /* error */ 8579 goto onError; 8580 if (x==enc_FAILED) { /* unencodable character */ 8581 if (charmap_encoding_error(unicode, &inpos, mapping, 8582 &exc, 8583 &error_handler, &error_handler_obj, errors, 8584 &res, &respos)) { 8585 goto onError; 8586 } 8587 } 8588 else 8589 /* done with this character => adjust input position */ 8590 ++inpos; 8591 } 8592 8593 /* Resize if we allocated to much */ 8594 if (respos<PyBytes_GET_SIZE(res)) 8595 if (_PyBytes_Resize(&res, respos) < 0) 8596 goto onError; 8597 8598 Py_XDECREF(exc); 8599 Py_XDECREF(error_handler_obj); 8600 return res; 8601 8602 onError: 8603 Py_XDECREF(res); 8604 Py_XDECREF(exc); 8605 Py_XDECREF(error_handler_obj); 8606 return NULL; 8607 } 8608 8609 /* Deprecated */ 8610 PyObject * 8611 PyUnicode_EncodeCharmap(const Py_UNICODE *p, 8612 Py_ssize_t size, 8613 PyObject *mapping, 8614 const char *errors) 8615 { 8616 PyObject *result; 8617 PyObject *unicode = PyUnicode_FromWideChar(p, size); 8618 if (unicode == NULL) 8619 return NULL; 8620 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); 8621 Py_DECREF(unicode); 8622 return result; 8623 } 8624 8625 PyObject * 8626 PyUnicode_AsCharmapString(PyObject *unicode, 8627 PyObject *mapping) 8628 { 8629 if (!PyUnicode_Check(unicode) || mapping == NULL) { 8630 PyErr_BadArgument(); 8631 return NULL; 8632 } 8633 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8634 } 8635 8636 /* create or adjust a UnicodeTranslateError */ 8637 static void 8638 make_translate_exception(PyObject **exceptionObject, 8639 PyObject *unicode, 8640 Py_ssize_t startpos, Py_ssize_t endpos, 8641 const char *reason) 8642 { 8643 if (*exceptionObject == 
NULL) { 8644 *exceptionObject = _PyUnicodeTranslateError_Create( 8645 unicode, startpos, endpos, reason); 8646 } 8647 else { 8648 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8649 goto onError; 8650 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8651 goto onError; 8652 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8653 goto onError; 8654 return; 8655 onError: 8656 Py_CLEAR(*exceptionObject); 8657 } 8658 } 8659 8660 /* error handling callback helper: 8661 build arguments, call the callback and check the arguments, 8662 put the result into newpos and return the replacement string, which 8663 has to be freed by the caller */ 8664 static PyObject * 8665 unicode_translate_call_errorhandler(const char *errors, 8666 PyObject **errorHandler, 8667 const char *reason, 8668 PyObject *unicode, PyObject **exceptionObject, 8669 Py_ssize_t startpos, Py_ssize_t endpos, 8670 Py_ssize_t *newpos) 8671 { 8672 static const char *argparse = "Un;translating error handler must return (str, int) tuple"; 8673 8674 Py_ssize_t i_newpos; 8675 PyObject *restuple; 8676 PyObject *resunicode; 8677 8678 if (*errorHandler == NULL) { 8679 *errorHandler = PyCodec_LookupError(errors); 8680 if (*errorHandler == NULL) 8681 return NULL; 8682 } 8683 8684 make_translate_exception(exceptionObject, 8685 unicode, startpos, endpos, reason); 8686 if (*exceptionObject == NULL) 8687 return NULL; 8688 8689 restuple = PyObject_CallFunctionObjArgs( 8690 *errorHandler, *exceptionObject, NULL); 8691 if (restuple == NULL) 8692 return NULL; 8693 if (!PyTuple_Check(restuple)) { 8694 PyErr_SetString(PyExc_TypeError, &argparse[3]); 8695 Py_DECREF(restuple); 8696 return NULL; 8697 } 8698 if (!PyArg_ParseTuple(restuple, argparse, 8699 &resunicode, &i_newpos)) { 8700 Py_DECREF(restuple); 8701 return NULL; 8702 } 8703 if (i_newpos<0) 8704 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8705 else 8706 *newpos = i_newpos; 8707 if (*newpos<0 || 
*newpos>PyUnicode_GET_LENGTH(unicode)) { 8708 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8709 Py_DECREF(restuple); 8710 return NULL; 8711 } 8712 Py_INCREF(resunicode); 8713 Py_DECREF(restuple); 8714 return resunicode; 8715 } 8716 8717 /* Lookup the character ch in the mapping and put the result in result, 8718 which must be decrefed by the caller. 8719 Return 0 on success, -1 on error */ 8720 static int 8721 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8722 { 8723 PyObject *w = PyLong_FromLong((long)c); 8724 PyObject *x; 8725 8726 if (w == NULL) 8727 return -1; 8728 x = PyObject_GetItem(mapping, w); 8729 Py_DECREF(w); 8730 if (x == NULL) { 8731 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8732 /* No mapping found means: use 1:1 mapping. */ 8733 PyErr_Clear(); 8734 *result = NULL; 8735 return 0; 8736 } else 8737 return -1; 8738 } 8739 else if (x == Py_None) { 8740 *result = x; 8741 return 0; 8742 } 8743 else if (PyLong_Check(x)) { 8744 long value = PyLong_AS_LONG(x); 8745 if (value < 0 || value > MAX_UNICODE) { 8746 PyErr_Format(PyExc_ValueError, 8747 "character mapping must be in range(0x%x)", 8748 MAX_UNICODE+1); 8749 Py_DECREF(x); 8750 return -1; 8751 } 8752 *result = x; 8753 return 0; 8754 } 8755 else if (PyUnicode_Check(x)) { 8756 *result = x; 8757 return 0; 8758 } 8759 else { 8760 /* wrong return value */ 8761 PyErr_SetString(PyExc_TypeError, 8762 "character mapping must return integer, None or str"); 8763 Py_DECREF(x); 8764 return -1; 8765 } 8766 } 8767 8768 /* lookup the character, write the result into the writer. 8769 Return 1 if the result was written into the writer, return 0 if the mapping 8770 was undefined, raise an exception return -1 on error. 
*/ 8771 static int 8772 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping, 8773 _PyUnicodeWriter *writer) 8774 { 8775 PyObject *item; 8776 8777 if (charmaptranslate_lookup(ch, mapping, &item)) 8778 return -1; 8779 8780 if (item == NULL) { 8781 /* not found => default to 1:1 mapping */ 8782 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 8783 return -1; 8784 } 8785 return 1; 8786 } 8787 8788 if (item == Py_None) { 8789 Py_DECREF(item); 8790 return 0; 8791 } 8792 8793 if (PyLong_Check(item)) { 8794 long ch = (Py_UCS4)PyLong_AS_LONG(item); 8795 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 8796 used it */ 8797 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 8798 Py_DECREF(item); 8799 return -1; 8800 } 8801 Py_DECREF(item); 8802 return 1; 8803 } 8804 8805 if (!PyUnicode_Check(item)) { 8806 Py_DECREF(item); 8807 return -1; 8808 } 8809 8810 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) { 8811 Py_DECREF(item); 8812 return -1; 8813 } 8814 8815 Py_DECREF(item); 8816 return 1; 8817 } 8818 8819 static int 8820 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch, 8821 Py_UCS1 *translate) 8822 { 8823 PyObject *item = NULL; 8824 int ret = 0; 8825 8826 if (charmaptranslate_lookup(ch, mapping, &item)) { 8827 return -1; 8828 } 8829 8830 if (item == Py_None) { 8831 /* deletion */ 8832 translate[ch] = 0xfe; 8833 } 8834 else if (item == NULL) { 8835 /* not found => default to 1:1 mapping */ 8836 translate[ch] = ch; 8837 return 1; 8838 } 8839 else if (PyLong_Check(item)) { 8840 long replace = PyLong_AS_LONG(item); 8841 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 8842 used it */ 8843 if (127 < replace) { 8844 /* invalid character or character outside ASCII: 8845 skip the fast translate */ 8846 goto exit; 8847 } 8848 translate[ch] = (Py_UCS1)replace; 8849 } 8850 else if (PyUnicode_Check(item)) { 8851 Py_UCS4 replace; 8852 8853 if (PyUnicode_READY(item) == -1) { 8854 Py_DECREF(item); 8855 return -1; 8856 } 8857 if 
(PyUnicode_GET_LENGTH(item) != 1) 8858 goto exit; 8859 8860 replace = PyUnicode_READ_CHAR(item, 0); 8861 if (replace > 127) 8862 goto exit; 8863 translate[ch] = (Py_UCS1)replace; 8864 } 8865 else { 8866 /* not None, NULL, long or unicode */ 8867 goto exit; 8868 } 8869 ret = 1; 8870 8871 exit: 8872 Py_DECREF(item); 8873 return ret; 8874 } 8875 8876 /* Fast path for ascii => ascii translation. Return 1 if the whole string 8877 was translated into writer, return 0 if the input string was partially 8878 translated into writer, raise an exception and return -1 on error. */ 8879 static int 8880 unicode_fast_translate(PyObject *input, PyObject *mapping, 8881 _PyUnicodeWriter *writer, int ignore, 8882 Py_ssize_t *input_pos) 8883 { 8884 Py_UCS1 ascii_table[128], ch, ch2; 8885 Py_ssize_t len; 8886 Py_UCS1 *in, *end, *out; 8887 int res = 0; 8888 8889 len = PyUnicode_GET_LENGTH(input); 8890 8891 memset(ascii_table, 0xff, 128); 8892 8893 in = PyUnicode_1BYTE_DATA(input); 8894 end = in + len; 8895 8896 assert(PyUnicode_IS_ASCII(writer->buffer)); 8897 assert(PyUnicode_GET_LENGTH(writer->buffer) == len); 8898 out = PyUnicode_1BYTE_DATA(writer->buffer); 8899 8900 for (; in < end; in++) { 8901 ch = *in; 8902 ch2 = ascii_table[ch]; 8903 if (ch2 == 0xff) { 8904 int translate = unicode_fast_translate_lookup(mapping, ch, 8905 ascii_table); 8906 if (translate < 0) 8907 return -1; 8908 if (translate == 0) 8909 goto exit; 8910 ch2 = ascii_table[ch]; 8911 } 8912 if (ch2 == 0xfe) { 8913 if (ignore) 8914 continue; 8915 goto exit; 8916 } 8917 assert(ch2 < 128); 8918 *out = ch2; 8919 out++; 8920 } 8921 res = 1; 8922 8923 exit: 8924 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer); 8925 *input_pos = in - PyUnicode_1BYTE_DATA(input); 8926 return res; 8927 } 8928 8929 static PyObject * 8930 _PyUnicode_TranslateCharmap(PyObject *input, 8931 PyObject *mapping, 8932 const char *errors) 8933 { 8934 /* input object */ 8935 char *data; 8936 Py_ssize_t size, i; 8937 int kind; 8938 /* output 
buffer */ 8939 _PyUnicodeWriter writer; 8940 /* error handler */ 8941 const char *reason = "character maps to <undefined>"; 8942 PyObject *errorHandler = NULL; 8943 PyObject *exc = NULL; 8944 int ignore; 8945 int res; 8946 8947 if (mapping == NULL) { 8948 PyErr_BadArgument(); 8949 return NULL; 8950 } 8951 8952 if (PyUnicode_READY(input) == -1) 8953 return NULL; 8954 data = (char*)PyUnicode_DATA(input); 8955 kind = PyUnicode_KIND(input); 8956 size = PyUnicode_GET_LENGTH(input); 8957 8958 if (size == 0) 8959 return PyUnicode_FromObject(input); 8960 8961 /* allocate enough for a simple 1:1 translation without 8962 replacements, if we need more, we'll resize */ 8963 _PyUnicodeWriter_Init(&writer); 8964 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) 8965 goto onError; 8966 8967 ignore = (errors != NULL && strcmp(errors, "ignore") == 0); 8968 8969 if (PyUnicode_READY(input) == -1) 8970 return NULL; 8971 if (PyUnicode_IS_ASCII(input)) { 8972 res = unicode_fast_translate(input, mapping, &writer, ignore, &i); 8973 if (res < 0) { 8974 _PyUnicodeWriter_Dealloc(&writer); 8975 return NULL; 8976 } 8977 if (res == 1) 8978 return _PyUnicodeWriter_Finish(&writer); 8979 } 8980 else { 8981 i = 0; 8982 } 8983 8984 while (i<size) { 8985 /* try to encode it */ 8986 int translate; 8987 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8988 Py_ssize_t newpos; 8989 /* startpos for collecting untranslatable chars */ 8990 Py_ssize_t collstart; 8991 Py_ssize_t collend; 8992 Py_UCS4 ch; 8993 8994 ch = PyUnicode_READ(kind, data, i); 8995 translate = charmaptranslate_output(ch, mapping, &writer); 8996 if (translate < 0) 8997 goto onError; 8998 8999 if (translate != 0) { 9000 /* it worked => adjust input pointer */ 9001 ++i; 9002 continue; 9003 } 9004 9005 /* untranslatable character */ 9006 collstart = i; 9007 collend = i+1; 9008 9009 /* find all untranslatable characters */ 9010 while (collend < size) { 9011 PyObject *x; 9012 ch = PyUnicode_READ(kind, data, collend); 
9013 if (charmaptranslate_lookup(ch, mapping, &x)) 9014 goto onError; 9015 Py_XDECREF(x); 9016 if (x != Py_None) 9017 break; 9018 ++collend; 9019 } 9020 9021 if (ignore) { 9022 i = collend; 9023 } 9024 else { 9025 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 9026 reason, input, &exc, 9027 collstart, collend, &newpos); 9028 if (repunicode == NULL) 9029 goto onError; 9030 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) { 9031 Py_DECREF(repunicode); 9032 goto onError; 9033 } 9034 Py_DECREF(repunicode); 9035 i = newpos; 9036 } 9037 } 9038 Py_XDECREF(exc); 9039 Py_XDECREF(errorHandler); 9040 return _PyUnicodeWriter_Finish(&writer); 9041 9042 onError: 9043 _PyUnicodeWriter_Dealloc(&writer); 9044 Py_XDECREF(exc); 9045 Py_XDECREF(errorHandler); 9046 return NULL; 9047 } 9048 9049 /* Deprecated. Use PyUnicode_Translate instead. */ 9050 PyObject * 9051 PyUnicode_TranslateCharmap(const Py_UNICODE *p, 9052 Py_ssize_t size, 9053 PyObject *mapping, 9054 const char *errors) 9055 { 9056 PyObject *result; 9057 PyObject *unicode = PyUnicode_FromWideChar(p, size); 9058 if (!unicode) 9059 return NULL; 9060 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors); 9061 Py_DECREF(unicode); 9062 return result; 9063 } 9064 9065 PyObject * 9066 PyUnicode_Translate(PyObject *str, 9067 PyObject *mapping, 9068 const char *errors) 9069 { 9070 if (ensure_unicode(str) < 0) 9071 return NULL; 9072 return _PyUnicode_TranslateCharmap(str, mapping, errors); 9073 } 9074 9075 PyObject * 9076 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 9077 { 9078 if (!PyUnicode_Check(unicode)) { 9079 PyErr_BadInternalCall(); 9080 return NULL; 9081 } 9082 if (PyUnicode_READY(unicode) == -1) 9083 return NULL; 9084 if (PyUnicode_IS_ASCII(unicode)) { 9085 /* If the string is already ASCII, just return the same string */ 9086 Py_INCREF(unicode); 9087 return unicode; 9088 } 9089 9090 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); 9091 PyObject *result = 
PyUnicode_New(len, 127); 9092 if (result == NULL) { 9093 return NULL; 9094 } 9095 9096 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result); 9097 int kind = PyUnicode_KIND(unicode); 9098 const void *data = PyUnicode_DATA(unicode); 9099 Py_ssize_t i; 9100 for (i = 0; i < len; ++i) { 9101 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9102 if (ch < 127) { 9103 out[i] = ch; 9104 } 9105 else if (Py_UNICODE_ISSPACE(ch)) { 9106 out[i] = ' '; 9107 } 9108 else { 9109 int decimal = Py_UNICODE_TODECIMAL(ch); 9110 if (decimal < 0) { 9111 out[i] = '?'; 9112 out[i+1] = '\0'; 9113 _PyUnicode_LENGTH(result) = i + 1; 9114 break; 9115 } 9116 out[i] = '0' + decimal; 9117 } 9118 } 9119 9120 assert(_PyUnicode_CheckConsistency(result, 1)); 9121 return result; 9122 } 9123 9124 PyObject * 9125 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 9126 Py_ssize_t length) 9127 { 9128 PyObject *decimal; 9129 Py_ssize_t i; 9130 Py_UCS4 maxchar; 9131 enum PyUnicode_Kind kind; 9132 void *data; 9133 9134 maxchar = 127; 9135 for (i = 0; i < length; i++) { 9136 Py_UCS4 ch = s[i]; 9137 if (ch > 127) { 9138 int decimal = Py_UNICODE_TODECIMAL(ch); 9139 if (decimal >= 0) 9140 ch = '0' + decimal; 9141 maxchar = Py_MAX(maxchar, ch); 9142 } 9143 } 9144 9145 /* Copy to a new string */ 9146 decimal = PyUnicode_New(length, maxchar); 9147 if (decimal == NULL) 9148 return decimal; 9149 kind = PyUnicode_KIND(decimal); 9150 data = PyUnicode_DATA(decimal); 9151 /* Iterate over code points */ 9152 for (i = 0; i < length; i++) { 9153 Py_UCS4 ch = s[i]; 9154 if (ch > 127) { 9155 int decimal = Py_UNICODE_TODECIMAL(ch); 9156 if (decimal >= 0) 9157 ch = '0' + decimal; 9158 } 9159 PyUnicode_WRITE(kind, data, i, ch); 9160 } 9161 return unicode_result(decimal); 9162 } 9163 /* --- Decimal Encoder ---------------------------------------------------- */ 9164 9165 int 9166 PyUnicode_EncodeDecimal(Py_UNICODE *s, 9167 Py_ssize_t length, 9168 char *output, 9169 const char *errors) 9170 { 9171 PyObject *unicode; 9172 Py_ssize_t i; 9173 enum 
PyUnicode_Kind kind; 9174 void *data; 9175 9176 if (output == NULL) { 9177 PyErr_BadArgument(); 9178 return -1; 9179 } 9180 9181 unicode = PyUnicode_FromWideChar(s, length); 9182 if (unicode == NULL) 9183 return -1; 9184 9185 kind = PyUnicode_KIND(unicode); 9186 data = PyUnicode_DATA(unicode); 9187 9188 for (i=0; i < length; ) { 9189 PyObject *exc; 9190 Py_UCS4 ch; 9191 int decimal; 9192 Py_ssize_t startpos; 9193 9194 ch = PyUnicode_READ(kind, data, i); 9195 9196 if (Py_UNICODE_ISSPACE(ch)) { 9197 *output++ = ' '; 9198 i++; 9199 continue; 9200 } 9201 decimal = Py_UNICODE_TODECIMAL(ch); 9202 if (decimal >= 0) { 9203 *output++ = '0' + decimal; 9204 i++; 9205 continue; 9206 } 9207 if (0 < ch && ch < 256) { 9208 *output++ = (char)ch; 9209 i++; 9210 continue; 9211 } 9212 9213 startpos = i; 9214 exc = NULL; 9215 raise_encode_exception(&exc, "decimal", unicode, 9216 startpos, startpos+1, 9217 "invalid decimal Unicode string"); 9218 Py_XDECREF(exc); 9219 Py_DECREF(unicode); 9220 return -1; 9221 } 9222 /* 0-terminate the output string */ 9223 *output++ = '\0'; 9224 Py_DECREF(unicode); 9225 return 0; 9226 } 9227 9228 /* --- Helpers ------------------------------------------------------------ */ 9229 9230 /* helper macro to fixup start/end slice values */ 9231 #define ADJUST_INDICES(start, end, len) \ 9232 if (end > len) \ 9233 end = len; \ 9234 else if (end < 0) { \ 9235 end += len; \ 9236 if (end < 0) \ 9237 end = 0; \ 9238 } \ 9239 if (start < 0) { \ 9240 start += len; \ 9241 if (start < 0) \ 9242 start = 0; \ 9243 } 9244 9245 static Py_ssize_t 9246 any_find_slice(PyObject* s1, PyObject* s2, 9247 Py_ssize_t start, 9248 Py_ssize_t end, 9249 int direction) 9250 { 9251 int kind1, kind2; 9252 void *buf1, *buf2; 9253 Py_ssize_t len1, len2, result; 9254 9255 kind1 = PyUnicode_KIND(s1); 9256 kind2 = PyUnicode_KIND(s2); 9257 if (kind1 < kind2) 9258 return -1; 9259 9260 len1 = PyUnicode_GET_LENGTH(s1); 9261 len2 = PyUnicode_GET_LENGTH(s2); 9262 ADJUST_INDICES(start, end, len1); 
/**
 * InsertThousandsGrouping:
 * @writer: Unicode writer to fill, or NULL to only count (see below).
 * @n_buffer: Offset used to derive the starting buffer position: in filling
 *            mode it is added to writer->pos, in counting mode it is used
 *            as-is.
 * @digits: Digits we're reading from. Must be non-NULL in filling mode and
 *          NULL in counting mode (asserted).
 * @d_pos: Start of digits string.
 * @n_digits: The number of digits in the string, in which we want
 *            to put the grouping chars.
 * @min_width: The minimum width of the digits in the output string.
 *             Output will be zero-padded on the left to fill.
 *             Negative values are treated as 0.
 * @grouping: see definition in localeconv().
 * @thousands_sep: see definition in localeconv().
 * @maxchar: Must be NULL in filling mode and non-NULL in counting mode
 *           (asserted). In counting mode it is initialised to 127 and then
 *           handed to InsertThousandsGrouping_fill() for every group;
 *           presumably the helper raises it as needed -- confirm in
 *           stringlib/localeutil.h.
 *
 * There are 2 modes: counting and filling. If @writer is NULL,
 *  we are in counting mode, else filling mode.
 * If counting, the required buffer size is returned.
 * If filling, we know the buffer will be large enough, so we don't
 *  need to pass in the buffer size.
 * Inserts thousand grouping characters (as defined by grouping and
 *  thousands_sep) into @writer.
 *
 * Return value: -1 on error, number of characters otherwise.
 **/
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    _PyUnicodeWriter *writer,
    Py_ssize_t n_buffer,
    PyObject *digits,
    Py_ssize_t d_pos,
    Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping,
    PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    min_width = Py_MAX(0, min_width);
    /* Exactly one of (writer, digits) / maxchar is in use, depending on
       the mode. */
    if (writer) {
        assert(digits != NULL);
        assert(maxchar == NULL);
    }
    else {
        assert(digits == NULL);
        assert(maxchar != NULL);
    }
    assert(0 <= d_pos);
    assert(0 <= n_digits);
    assert(grouping != NULL);

    if (digits != NULL) {
        if (PyUnicode_READY(digits) == -1) {
            return -1;
        }
    }
    if (PyUnicode_READY(thousands_sep) == -1) {
        return -1;
    }

    Py_ssize_t count = 0;
    Py_ssize_t n_zeros;
    int loop_broken = 0;
    int use_separator = 0; /* First time through, don't append the
                              separator. They only go between
                              groups. */
    Py_ssize_t buffer_pos;
    Py_ssize_t digits_pos;
    Py_ssize_t len;
    Py_ssize_t n_chars;
    Py_ssize_t remaining = n_digits; /* Number of chars remaining to
                                        be looked at */
    /* A generator that returns all of the grouping widths, until it
       returns 0. */
    GroupGenerator groupgen;
    GroupGenerator_init(&groupgen, grouping);
    const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);

    /* if digits are not grouped, thousands separator
       should be an empty string */
    assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));

    /* Both positions start at the END of their respective ranges. */
    digits_pos = d_pos + n_digits;
    if (writer) {
        buffer_pos = writer->pos + n_buffer;
        assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
        assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
    }
    else {
        buffer_pos = n_buffer;
    }

    if (!writer) {
        *maxchar = 127;
    }

    while ((len = GroupGenerator_next(&groupgen)) > 0) {
        /* Clamp the group to what is still needed, but always emit at
           least one character. */
        len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zero zero's and n_chars chars */

        /* Add this group (and its leading separator, if any) to the
           running total. */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Hand the group to the fill helper (called in both modes). */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);

        /* Use a separator next time. */
        use_separator = 1;

        remaining -= n_chars;
        min_width -= len;

        if (remaining <= 0 && min_width <= 0) {
            loop_broken = 1;
            break;
        }
        /* The separator that will precede the next group also counts
           towards the requested minimum width. */
        min_width -= thousands_sep_len;
    }
    if (!loop_broken) {
        /* We left the loop without using a break statement: the generator
           ran out of group widths, so everything that is left goes into
           one final group. */

        len = Py_MAX(Py_MAX(remaining, min_width), 1);
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zero zero's and n_chars chars */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Hand the final group to the fill helper. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);
    }
    return count;
}
PY_SSIZE_T_MAX 9529 ); 9530 break; 9531 default: 9532 Py_UNREACHABLE(); 9533 } 9534 9535 if (kind2 != kind1) 9536 PyMem_Free(buf2); 9537 9538 return result; 9539 onError: 9540 if (kind2 != kind1 && buf2) 9541 PyMem_Free(buf2); 9542 return -1; 9543 } 9544 9545 Py_ssize_t 9546 PyUnicode_Find(PyObject *str, 9547 PyObject *substr, 9548 Py_ssize_t start, 9549 Py_ssize_t end, 9550 int direction) 9551 { 9552 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) 9553 return -2; 9554 9555 return any_find_slice(str, substr, start, end, direction); 9556 } 9557 9558 Py_ssize_t 9559 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 9560 Py_ssize_t start, Py_ssize_t end, 9561 int direction) 9562 { 9563 int kind; 9564 Py_ssize_t len, result; 9565 if (PyUnicode_READY(str) == -1) 9566 return -2; 9567 len = PyUnicode_GET_LENGTH(str); 9568 ADJUST_INDICES(start, end, len); 9569 if (end - start < 1) 9570 return -1; 9571 kind = PyUnicode_KIND(str); 9572 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start, 9573 kind, end-start, ch, direction); 9574 if (result == -1) 9575 return -1; 9576 else 9577 return start + result; 9578 } 9579 9580 static int 9581 tailmatch(PyObject *self, 9582 PyObject *substring, 9583 Py_ssize_t start, 9584 Py_ssize_t end, 9585 int direction) 9586 { 9587 int kind_self; 9588 int kind_sub; 9589 void *data_self; 9590 void *data_sub; 9591 Py_ssize_t offset; 9592 Py_ssize_t i; 9593 Py_ssize_t end_sub; 9594 9595 if (PyUnicode_READY(self) == -1 || 9596 PyUnicode_READY(substring) == -1) 9597 return -1; 9598 9599 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 9600 end -= PyUnicode_GET_LENGTH(substring); 9601 if (end < start) 9602 return 0; 9603 9604 if (PyUnicode_GET_LENGTH(substring) == 0) 9605 return 1; 9606 9607 kind_self = PyUnicode_KIND(self); 9608 data_self = PyUnicode_DATA(self); 9609 kind_sub = PyUnicode_KIND(substring); 9610 data_sub = PyUnicode_DATA(substring); 9611 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 9612 9613 if (direction > 0) 9614 
offset = end; 9615 else 9616 offset = start; 9617 9618 if (PyUnicode_READ(kind_self, data_self, offset) == 9619 PyUnicode_READ(kind_sub, data_sub, 0) && 9620 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 9621 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 9622 /* If both are of the same kind, memcmp is sufficient */ 9623 if (kind_self == kind_sub) { 9624 return ! memcmp((char *)data_self + 9625 (offset * PyUnicode_KIND(substring)), 9626 data_sub, 9627 PyUnicode_GET_LENGTH(substring) * 9628 PyUnicode_KIND(substring)); 9629 } 9630 /* otherwise we have to compare each character by first accessing it */ 9631 else { 9632 /* We do not need to compare 0 and len(substring)-1 because 9633 the if statement above ensured already that they are equal 9634 when we end up here. */ 9635 for (i = 1; i < end_sub; ++i) { 9636 if (PyUnicode_READ(kind_self, data_self, offset + i) != 9637 PyUnicode_READ(kind_sub, data_sub, i)) 9638 return 0; 9639 } 9640 return 1; 9641 } 9642 } 9643 9644 return 0; 9645 } 9646 9647 Py_ssize_t 9648 PyUnicode_Tailmatch(PyObject *str, 9649 PyObject *substr, 9650 Py_ssize_t start, 9651 Py_ssize_t end, 9652 int direction) 9653 { 9654 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) 9655 return -1; 9656 9657 return tailmatch(str, substr, start, end, direction); 9658 } 9659 9660 static PyObject * 9661 ascii_upper_or_lower(PyObject *self, int lower) 9662 { 9663 Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9664 char *resdata, *data = PyUnicode_DATA(self); 9665 PyObject *res; 9666 9667 res = PyUnicode_New(len, 127); 9668 if (res == NULL) 9669 return NULL; 9670 resdata = PyUnicode_DATA(res); 9671 if (lower) 9672 _Py_bytes_lower(resdata, data, len); 9673 else 9674 _Py_bytes_upper(resdata, data, len); 9675 return res; 9676 } 9677 9678 static Py_UCS4 9679 handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i) 9680 { 9681 Py_ssize_t j; 9682 int final_sigma; 9683 Py_UCS4 c = 0; /* initialize to prevent gcc warning */ 9684 /* 
U+03A3 is in the Final_Sigma context when, it is found like this: 9685 9686 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased}) 9687 9688 where ! is a negation and \p{xxx} is a character with property xxx. 9689 */ 9690 for (j = i - 1; j >= 0; j--) { 9691 c = PyUnicode_READ(kind, data, j); 9692 if (!_PyUnicode_IsCaseIgnorable(c)) 9693 break; 9694 } 9695 final_sigma = j >= 0 && _PyUnicode_IsCased(c); 9696 if (final_sigma) { 9697 for (j = i + 1; j < length; j++) { 9698 c = PyUnicode_READ(kind, data, j); 9699 if (!_PyUnicode_IsCaseIgnorable(c)) 9700 break; 9701 } 9702 final_sigma = j == length || !_PyUnicode_IsCased(c); 9703 } 9704 return (final_sigma) ? 0x3C2 : 0x3C3; 9705 } 9706 9707 static int 9708 lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i, 9709 Py_UCS4 c, Py_UCS4 *mapped) 9710 { 9711 /* Obscure special case. */ 9712 if (c == 0x3A3) { 9713 mapped[0] = handle_capital_sigma(kind, data, length, i); 9714 return 1; 9715 } 9716 return _PyUnicode_ToLowerFull(c, mapped); 9717 } 9718 9719 static Py_ssize_t 9720 do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9721 { 9722 Py_ssize_t i, k = 0; 9723 int n_res, j; 9724 Py_UCS4 c, mapped[3]; 9725 9726 c = PyUnicode_READ(kind, data, 0); 9727 n_res = _PyUnicode_ToUpperFull(c, mapped); 9728 for (j = 0; j < n_res; j++) { 9729 *maxchar = Py_MAX(*maxchar, mapped[j]); 9730 res[k++] = mapped[j]; 9731 } 9732 for (i = 1; i < length; i++) { 9733 c = PyUnicode_READ(kind, data, i); 9734 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9735 for (j = 0; j < n_res; j++) { 9736 *maxchar = Py_MAX(*maxchar, mapped[j]); 9737 res[k++] = mapped[j]; 9738 } 9739 } 9740 return k; 9741 } 9742 9743 static Py_ssize_t 9744 do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { 9745 Py_ssize_t i, k = 0; 9746 9747 for (i = 0; i < length; i++) { 9748 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9749 int n_res, j; 9750 if 
(Py_UNICODE_ISUPPER(c)) { 9751 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9752 } 9753 else if (Py_UNICODE_ISLOWER(c)) { 9754 n_res = _PyUnicode_ToUpperFull(c, mapped); 9755 } 9756 else { 9757 n_res = 1; 9758 mapped[0] = c; 9759 } 9760 for (j = 0; j < n_res; j++) { 9761 *maxchar = Py_MAX(*maxchar, mapped[j]); 9762 res[k++] = mapped[j]; 9763 } 9764 } 9765 return k; 9766 } 9767 9768 static Py_ssize_t 9769 do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, 9770 Py_UCS4 *maxchar, int lower) 9771 { 9772 Py_ssize_t i, k = 0; 9773 9774 for (i = 0; i < length; i++) { 9775 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9776 int n_res, j; 9777 if (lower) 9778 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9779 else 9780 n_res = _PyUnicode_ToUpperFull(c, mapped); 9781 for (j = 0; j < n_res; j++) { 9782 *maxchar = Py_MAX(*maxchar, mapped[j]); 9783 res[k++] = mapped[j]; 9784 } 9785 } 9786 return k; 9787 } 9788 9789 static Py_ssize_t 9790 do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9791 { 9792 return do_upper_or_lower(kind, data, length, res, maxchar, 0); 9793 } 9794 9795 static Py_ssize_t 9796 do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9797 { 9798 return do_upper_or_lower(kind, data, length, res, maxchar, 1); 9799 } 9800 9801 static Py_ssize_t 9802 do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9803 { 9804 Py_ssize_t i, k = 0; 9805 9806 for (i = 0; i < length; i++) { 9807 Py_UCS4 c = PyUnicode_READ(kind, data, i); 9808 Py_UCS4 mapped[3]; 9809 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); 9810 for (j = 0; j < n_res; j++) { 9811 *maxchar = Py_MAX(*maxchar, mapped[j]); 9812 res[k++] = mapped[j]; 9813 } 9814 } 9815 return k; 9816 } 9817 9818 static Py_ssize_t 9819 do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9820 { 9821 Py_ssize_t i, k = 0; 9822 int previous_is_cased; 9823 9824 
previous_is_cased = 0; 9825 for (i = 0; i < length; i++) { 9826 const Py_UCS4 c = PyUnicode_READ(kind, data, i); 9827 Py_UCS4 mapped[3]; 9828 int n_res, j; 9829 9830 if (previous_is_cased) 9831 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9832 else 9833 n_res = _PyUnicode_ToTitleFull(c, mapped); 9834 9835 for (j = 0; j < n_res; j++) { 9836 *maxchar = Py_MAX(*maxchar, mapped[j]); 9837 res[k++] = mapped[j]; 9838 } 9839 9840 previous_is_cased = _PyUnicode_IsCased(c); 9841 } 9842 return k; 9843 } 9844 9845 static PyObject * 9846 case_operation(PyObject *self, 9847 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *)) 9848 { 9849 PyObject *res = NULL; 9850 Py_ssize_t length, newlength = 0; 9851 int kind, outkind; 9852 void *data, *outdata; 9853 Py_UCS4 maxchar = 0, *tmp, *tmpend; 9854 9855 assert(PyUnicode_IS_READY(self)); 9856 9857 kind = PyUnicode_KIND(self); 9858 data = PyUnicode_DATA(self); 9859 length = PyUnicode_GET_LENGTH(self); 9860 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) { 9861 PyErr_SetString(PyExc_OverflowError, "string is too long"); 9862 return NULL; 9863 } 9864 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length); 9865 if (tmp == NULL) 9866 return PyErr_NoMemory(); 9867 newlength = perform(kind, data, length, tmp, &maxchar); 9868 res = PyUnicode_New(newlength, maxchar); 9869 if (res == NULL) 9870 goto leave; 9871 tmpend = tmp + newlength; 9872 outdata = PyUnicode_DATA(res); 9873 outkind = PyUnicode_KIND(res); 9874 switch (outkind) { 9875 case PyUnicode_1BYTE_KIND: 9876 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata); 9877 break; 9878 case PyUnicode_2BYTE_KIND: 9879 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata); 9880 break; 9881 case PyUnicode_4BYTE_KIND: 9882 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength); 9883 break; 9884 default: 9885 Py_UNREACHABLE(); 9886 } 9887 leave: 9888 PyMem_FREE(tmp); 9889 return res; 9890 } 9891 9892 PyObject * 9893 PyUnicode_Join(PyObject 
*separator, PyObject *seq) 9894 { 9895 PyObject *res; 9896 PyObject *fseq; 9897 Py_ssize_t seqlen; 9898 PyObject **items; 9899 9900 fseq = PySequence_Fast(seq, "can only join an iterable"); 9901 if (fseq == NULL) { 9902 return NULL; 9903 } 9904 9905 /* NOTE: the following code can't call back into Python code, 9906 * so we are sure that fseq won't be mutated. 9907 */ 9908 9909 items = PySequence_Fast_ITEMS(fseq); 9910 seqlen = PySequence_Fast_GET_SIZE(fseq); 9911 res = _PyUnicode_JoinArray(separator, items, seqlen); 9912 Py_DECREF(fseq); 9913 return res; 9914 } 9915 9916 PyObject * 9917 _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen) 9918 { 9919 PyObject *res = NULL; /* the result */ 9920 PyObject *sep = NULL; 9921 Py_ssize_t seplen; 9922 PyObject *item; 9923 Py_ssize_t sz, i, res_offset; 9924 Py_UCS4 maxchar; 9925 Py_UCS4 item_maxchar; 9926 int use_memcpy; 9927 unsigned char *res_data = NULL, *sep_data = NULL; 9928 PyObject *last_obj; 9929 unsigned int kind = 0; 9930 9931 /* If empty sequence, return u"". */ 9932 if (seqlen == 0) { 9933 _Py_RETURN_UNICODE_EMPTY(); 9934 } 9935 9936 /* If singleton sequence with an exact Unicode, return that. 
*/ 9937 last_obj = NULL; 9938 if (seqlen == 1) { 9939 if (PyUnicode_CheckExact(items[0])) { 9940 res = items[0]; 9941 Py_INCREF(res); 9942 return res; 9943 } 9944 seplen = 0; 9945 maxchar = 0; 9946 } 9947 else { 9948 /* Set up sep and seplen */ 9949 if (separator == NULL) { 9950 /* fall back to a blank space separator */ 9951 sep = PyUnicode_FromOrdinal(' '); 9952 if (!sep) 9953 goto onError; 9954 seplen = 1; 9955 maxchar = 32; 9956 } 9957 else { 9958 if (!PyUnicode_Check(separator)) { 9959 PyErr_Format(PyExc_TypeError, 9960 "separator: expected str instance," 9961 " %.80s found", 9962 Py_TYPE(separator)->tp_name); 9963 goto onError; 9964 } 9965 if (PyUnicode_READY(separator)) 9966 goto onError; 9967 sep = separator; 9968 seplen = PyUnicode_GET_LENGTH(separator); 9969 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9970 /* inc refcount to keep this code path symmetric with the 9971 above case of a blank separator */ 9972 Py_INCREF(sep); 9973 } 9974 last_obj = sep; 9975 } 9976 9977 /* There are at least two things to join, or else we have a subclass 9978 * of str in the sequence. 9979 * Do a pre-pass to figure out the total amount of space we'll 9980 * need (sz), and see whether all argument are strings. 
9981 */ 9982 sz = 0; 9983 #ifdef Py_DEBUG 9984 use_memcpy = 0; 9985 #else 9986 use_memcpy = 1; 9987 #endif 9988 for (i = 0; i < seqlen; i++) { 9989 size_t add_sz; 9990 item = items[i]; 9991 if (!PyUnicode_Check(item)) { 9992 PyErr_Format(PyExc_TypeError, 9993 "sequence item %zd: expected str instance," 9994 " %.80s found", 9995 i, Py_TYPE(item)->tp_name); 9996 goto onError; 9997 } 9998 if (PyUnicode_READY(item) == -1) 9999 goto onError; 10000 add_sz = PyUnicode_GET_LENGTH(item); 10001 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 10002 maxchar = Py_MAX(maxchar, item_maxchar); 10003 if (i != 0) { 10004 add_sz += seplen; 10005 } 10006 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) { 10007 PyErr_SetString(PyExc_OverflowError, 10008 "join() result is too long for a Python string"); 10009 goto onError; 10010 } 10011 sz += add_sz; 10012 if (use_memcpy && last_obj != NULL) { 10013 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 10014 use_memcpy = 0; 10015 } 10016 last_obj = item; 10017 } 10018 10019 res = PyUnicode_New(sz, maxchar); 10020 if (res == NULL) 10021 goto onError; 10022 10023 /* Catenate everything. */ 10024 #ifdef Py_DEBUG 10025 use_memcpy = 0; 10026 #else 10027 if (use_memcpy) { 10028 res_data = PyUnicode_1BYTE_DATA(res); 10029 kind = PyUnicode_KIND(res); 10030 if (seplen != 0) 10031 sep_data = PyUnicode_1BYTE_DATA(sep); 10032 } 10033 #endif 10034 if (use_memcpy) { 10035 for (i = 0; i < seqlen; ++i) { 10036 Py_ssize_t itemlen; 10037 item = items[i]; 10038 10039 /* Copy item, and maybe the separator. 
*/ 10040 if (i && seplen != 0) { 10041 memcpy(res_data, 10042 sep_data, 10043 kind * seplen); 10044 res_data += kind * seplen; 10045 } 10046 10047 itemlen = PyUnicode_GET_LENGTH(item); 10048 if (itemlen != 0) { 10049 memcpy(res_data, 10050 PyUnicode_DATA(item), 10051 kind * itemlen); 10052 res_data += kind * itemlen; 10053 } 10054 } 10055 assert(res_data == PyUnicode_1BYTE_DATA(res) 10056 + kind * PyUnicode_GET_LENGTH(res)); 10057 } 10058 else { 10059 for (i = 0, res_offset = 0; i < seqlen; ++i) { 10060 Py_ssize_t itemlen; 10061 item = items[i]; 10062 10063 /* Copy item, and maybe the separator. */ 10064 if (i && seplen != 0) { 10065 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen); 10066 res_offset += seplen; 10067 } 10068 10069 itemlen = PyUnicode_GET_LENGTH(item); 10070 if (itemlen != 0) { 10071 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen); 10072 res_offset += itemlen; 10073 } 10074 } 10075 assert(res_offset == PyUnicode_GET_LENGTH(res)); 10076 } 10077 10078 Py_XDECREF(sep); 10079 assert(_PyUnicode_CheckConsistency(res, 1)); 10080 return res; 10081 10082 onError: 10083 Py_XDECREF(sep); 10084 Py_XDECREF(res); 10085 return NULL; 10086 } 10087 10088 void 10089 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 10090 Py_UCS4 fill_char) 10091 { 10092 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 10093 void *data = PyUnicode_DATA(unicode); 10094 assert(PyUnicode_IS_READY(unicode)); 10095 assert(unicode_modifiable(unicode)); 10096 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); 10097 assert(start >= 0); 10098 assert(start + length <= PyUnicode_GET_LENGTH(unicode)); 10099 FILL(kind, data, fill_char, start, length); 10100 } 10101 10102 Py_ssize_t 10103 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 10104 Py_UCS4 fill_char) 10105 { 10106 Py_ssize_t maxlen; 10107 10108 if (!PyUnicode_Check(unicode)) { 10109 PyErr_BadInternalCall(); 10110 return -1; 10111 } 10112 if 
(PyUnicode_READY(unicode) == -1) 10113 return -1; 10114 if (unicode_check_modifiable(unicode)) 10115 return -1; 10116 10117 if (start < 0) { 10118 PyErr_SetString(PyExc_IndexError, "string index out of range"); 10119 return -1; 10120 } 10121 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) { 10122 PyErr_SetString(PyExc_ValueError, 10123 "fill character is bigger than " 10124 "the string maximum character"); 10125 return -1; 10126 } 10127 10128 maxlen = PyUnicode_GET_LENGTH(unicode) - start; 10129 length = Py_MIN(maxlen, length); 10130 if (length <= 0) 10131 return 0; 10132 10133 _PyUnicode_FastFill(unicode, start, length, fill_char); 10134 return length; 10135 } 10136 10137 static PyObject * 10138 pad(PyObject *self, 10139 Py_ssize_t left, 10140 Py_ssize_t right, 10141 Py_UCS4 fill) 10142 { 10143 PyObject *u; 10144 Py_UCS4 maxchar; 10145 int kind; 10146 void *data; 10147 10148 if (left < 0) 10149 left = 0; 10150 if (right < 0) 10151 right = 0; 10152 10153 if (left == 0 && right == 0) 10154 return unicode_result_unchanged(self); 10155 10156 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 10157 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 10158 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 10159 return NULL; 10160 } 10161 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10162 maxchar = Py_MAX(maxchar, fill); 10163 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 10164 if (!u) 10165 return NULL; 10166 10167 kind = PyUnicode_KIND(u); 10168 data = PyUnicode_DATA(u); 10169 if (left) 10170 FILL(kind, data, fill, 0, left); 10171 if (right) 10172 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 10173 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self)); 10174 assert(_PyUnicode_CheckConsistency(u, 1)); 10175 return u; 10176 } 10177 10178 PyObject * 10179 PyUnicode_Splitlines(PyObject *string, int keepends) 10180 { 10181 PyObject *list; 10182 10183 if (ensure_unicode(string) < 0) 10184 
return NULL; 10185 10186 switch (PyUnicode_KIND(string)) { 10187 case PyUnicode_1BYTE_KIND: 10188 if (PyUnicode_IS_ASCII(string)) 10189 list = asciilib_splitlines( 10190 string, PyUnicode_1BYTE_DATA(string), 10191 PyUnicode_GET_LENGTH(string), keepends); 10192 else 10193 list = ucs1lib_splitlines( 10194 string, PyUnicode_1BYTE_DATA(string), 10195 PyUnicode_GET_LENGTH(string), keepends); 10196 break; 10197 case PyUnicode_2BYTE_KIND: 10198 list = ucs2lib_splitlines( 10199 string, PyUnicode_2BYTE_DATA(string), 10200 PyUnicode_GET_LENGTH(string), keepends); 10201 break; 10202 case PyUnicode_4BYTE_KIND: 10203 list = ucs4lib_splitlines( 10204 string, PyUnicode_4BYTE_DATA(string), 10205 PyUnicode_GET_LENGTH(string), keepends); 10206 break; 10207 default: 10208 Py_UNREACHABLE(); 10209 } 10210 return list; 10211 } 10212 10213 static PyObject * 10214 split(PyObject *self, 10215 PyObject *substring, 10216 Py_ssize_t maxcount) 10217 { 10218 int kind1, kind2; 10219 void *buf1, *buf2; 10220 Py_ssize_t len1, len2; 10221 PyObject* out; 10222 10223 if (maxcount < 0) 10224 maxcount = PY_SSIZE_T_MAX; 10225 10226 if (PyUnicode_READY(self) == -1) 10227 return NULL; 10228 10229 if (substring == NULL) 10230 switch (PyUnicode_KIND(self)) { 10231 case PyUnicode_1BYTE_KIND: 10232 if (PyUnicode_IS_ASCII(self)) 10233 return asciilib_split_whitespace( 10234 self, PyUnicode_1BYTE_DATA(self), 10235 PyUnicode_GET_LENGTH(self), maxcount 10236 ); 10237 else 10238 return ucs1lib_split_whitespace( 10239 self, PyUnicode_1BYTE_DATA(self), 10240 PyUnicode_GET_LENGTH(self), maxcount 10241 ); 10242 case PyUnicode_2BYTE_KIND: 10243 return ucs2lib_split_whitespace( 10244 self, PyUnicode_2BYTE_DATA(self), 10245 PyUnicode_GET_LENGTH(self), maxcount 10246 ); 10247 case PyUnicode_4BYTE_KIND: 10248 return ucs4lib_split_whitespace( 10249 self, PyUnicode_4BYTE_DATA(self), 10250 PyUnicode_GET_LENGTH(self), maxcount 10251 ); 10252 default: 10253 Py_UNREACHABLE(); 10254 } 10255 10256 if (PyUnicode_READY(substring) == 
-1) 10257 return NULL; 10258 10259 kind1 = PyUnicode_KIND(self); 10260 kind2 = PyUnicode_KIND(substring); 10261 len1 = PyUnicode_GET_LENGTH(self); 10262 len2 = PyUnicode_GET_LENGTH(substring); 10263 if (kind1 < kind2 || len1 < len2) { 10264 out = PyList_New(1); 10265 if (out == NULL) 10266 return NULL; 10267 Py_INCREF(self); 10268 PyList_SET_ITEM(out, 0, self); 10269 return out; 10270 } 10271 buf1 = PyUnicode_DATA(self); 10272 buf2 = PyUnicode_DATA(substring); 10273 if (kind2 != kind1) { 10274 buf2 = _PyUnicode_AsKind(substring, kind1); 10275 if (!buf2) 10276 return NULL; 10277 } 10278 10279 switch (kind1) { 10280 case PyUnicode_1BYTE_KIND: 10281 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10282 out = asciilib_split( 10283 self, buf1, len1, buf2, len2, maxcount); 10284 else 10285 out = ucs1lib_split( 10286 self, buf1, len1, buf2, len2, maxcount); 10287 break; 10288 case PyUnicode_2BYTE_KIND: 10289 out = ucs2lib_split( 10290 self, buf1, len1, buf2, len2, maxcount); 10291 break; 10292 case PyUnicode_4BYTE_KIND: 10293 out = ucs4lib_split( 10294 self, buf1, len1, buf2, len2, maxcount); 10295 break; 10296 default: 10297 out = NULL; 10298 } 10299 if (kind2 != kind1) 10300 PyMem_Free(buf2); 10301 return out; 10302 } 10303 10304 static PyObject * 10305 rsplit(PyObject *self, 10306 PyObject *substring, 10307 Py_ssize_t maxcount) 10308 { 10309 int kind1, kind2; 10310 void *buf1, *buf2; 10311 Py_ssize_t len1, len2; 10312 PyObject* out; 10313 10314 if (maxcount < 0) 10315 maxcount = PY_SSIZE_T_MAX; 10316 10317 if (PyUnicode_READY(self) == -1) 10318 return NULL; 10319 10320 if (substring == NULL) 10321 switch (PyUnicode_KIND(self)) { 10322 case PyUnicode_1BYTE_KIND: 10323 if (PyUnicode_IS_ASCII(self)) 10324 return asciilib_rsplit_whitespace( 10325 self, PyUnicode_1BYTE_DATA(self), 10326 PyUnicode_GET_LENGTH(self), maxcount 10327 ); 10328 else 10329 return ucs1lib_rsplit_whitespace( 10330 self, PyUnicode_1BYTE_DATA(self), 10331 PyUnicode_GET_LENGTH(self), 
maxcount 10332 ); 10333 case PyUnicode_2BYTE_KIND: 10334 return ucs2lib_rsplit_whitespace( 10335 self, PyUnicode_2BYTE_DATA(self), 10336 PyUnicode_GET_LENGTH(self), maxcount 10337 ); 10338 case PyUnicode_4BYTE_KIND: 10339 return ucs4lib_rsplit_whitespace( 10340 self, PyUnicode_4BYTE_DATA(self), 10341 PyUnicode_GET_LENGTH(self), maxcount 10342 ); 10343 default: 10344 Py_UNREACHABLE(); 10345 } 10346 10347 if (PyUnicode_READY(substring) == -1) 10348 return NULL; 10349 10350 kind1 = PyUnicode_KIND(self); 10351 kind2 = PyUnicode_KIND(substring); 10352 len1 = PyUnicode_GET_LENGTH(self); 10353 len2 = PyUnicode_GET_LENGTH(substring); 10354 if (kind1 < kind2 || len1 < len2) { 10355 out = PyList_New(1); 10356 if (out == NULL) 10357 return NULL; 10358 Py_INCREF(self); 10359 PyList_SET_ITEM(out, 0, self); 10360 return out; 10361 } 10362 buf1 = PyUnicode_DATA(self); 10363 buf2 = PyUnicode_DATA(substring); 10364 if (kind2 != kind1) { 10365 buf2 = _PyUnicode_AsKind(substring, kind1); 10366 if (!buf2) 10367 return NULL; 10368 } 10369 10370 switch (kind1) { 10371 case PyUnicode_1BYTE_KIND: 10372 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10373 out = asciilib_rsplit( 10374 self, buf1, len1, buf2, len2, maxcount); 10375 else 10376 out = ucs1lib_rsplit( 10377 self, buf1, len1, buf2, len2, maxcount); 10378 break; 10379 case PyUnicode_2BYTE_KIND: 10380 out = ucs2lib_rsplit( 10381 self, buf1, len1, buf2, len2, maxcount); 10382 break; 10383 case PyUnicode_4BYTE_KIND: 10384 out = ucs4lib_rsplit( 10385 self, buf1, len1, buf2, len2, maxcount); 10386 break; 10387 default: 10388 out = NULL; 10389 } 10390 if (kind2 != kind1) 10391 PyMem_Free(buf2); 10392 return out; 10393 } 10394 10395 static Py_ssize_t 10396 anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 10397 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 10398 { 10399 switch (kind) { 10400 case PyUnicode_1BYTE_KIND: 10401 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 10402 
return asciilib_find(buf1, len1, buf2, len2, offset); 10403 else 10404 return ucs1lib_find(buf1, len1, buf2, len2, offset); 10405 case PyUnicode_2BYTE_KIND: 10406 return ucs2lib_find(buf1, len1, buf2, len2, offset); 10407 case PyUnicode_4BYTE_KIND: 10408 return ucs4lib_find(buf1, len1, buf2, len2, offset); 10409 } 10410 Py_UNREACHABLE(); 10411 } 10412 10413 static Py_ssize_t 10414 anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 10415 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 10416 { 10417 switch (kind) { 10418 case PyUnicode_1BYTE_KIND: 10419 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 10420 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 10421 else 10422 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 10423 case PyUnicode_2BYTE_KIND: 10424 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 10425 case PyUnicode_4BYTE_KIND: 10426 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 10427 } 10428 Py_UNREACHABLE(); 10429 } 10430 10431 static void 10432 replace_1char_inplace(PyObject *u, Py_ssize_t pos, 10433 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount) 10434 { 10435 int kind = PyUnicode_KIND(u); 10436 void *data = PyUnicode_DATA(u); 10437 Py_ssize_t len = PyUnicode_GET_LENGTH(u); 10438 if (kind == PyUnicode_1BYTE_KIND) { 10439 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos, 10440 (Py_UCS1 *)data + len, 10441 u1, u2, maxcount); 10442 } 10443 else if (kind == PyUnicode_2BYTE_KIND) { 10444 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos, 10445 (Py_UCS2 *)data + len, 10446 u1, u2, maxcount); 10447 } 10448 else { 10449 assert(kind == PyUnicode_4BYTE_KIND); 10450 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos, 10451 (Py_UCS4 *)data + len, 10452 u1, u2, maxcount); 10453 } 10454 } 10455 10456 static PyObject * 10457 replace(PyObject *self, PyObject *str1, 10458 PyObject *str2, Py_ssize_t maxcount) 10459 { 10460 PyObject *u; 10461 char *sbuf = PyUnicode_DATA(self); 10462 char *buf1 
= PyUnicode_DATA(str1); 10463 char *buf2 = PyUnicode_DATA(str2); 10464 int srelease = 0, release1 = 0, release2 = 0; 10465 int skind = PyUnicode_KIND(self); 10466 int kind1 = PyUnicode_KIND(str1); 10467 int kind2 = PyUnicode_KIND(str2); 10468 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 10469 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 10470 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 10471 int mayshrink; 10472 Py_UCS4 maxchar, maxchar_str1, maxchar_str2; 10473 10474 if (maxcount < 0) 10475 maxcount = PY_SSIZE_T_MAX; 10476 else if (maxcount == 0 || slen == 0) 10477 goto nothing; 10478 10479 if (str1 == str2) 10480 goto nothing; 10481 10482 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10483 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1); 10484 if (maxchar < maxchar_str1) 10485 /* substring too wide to be present */ 10486 goto nothing; 10487 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 10488 /* Replacing str1 with str2 may cause a maxchar reduction in the 10489 result string. */ 10490 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1); 10491 maxchar = Py_MAX(maxchar, maxchar_str2); 10492 10493 if (len1 == len2) { 10494 /* same length */ 10495 if (len1 == 0) 10496 goto nothing; 10497 if (len1 == 1) { 10498 /* replace characters */ 10499 Py_UCS4 u1, u2; 10500 Py_ssize_t pos; 10501 10502 u1 = PyUnicode_READ(kind1, buf1, 0); 10503 pos = findchar(sbuf, skind, slen, u1, 1); 10504 if (pos < 0) 10505 goto nothing; 10506 u2 = PyUnicode_READ(kind2, buf2, 0); 10507 u = PyUnicode_New(slen, maxchar); 10508 if (!u) 10509 goto error; 10510 10511 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen); 10512 replace_1char_inplace(u, pos, u1, u2, maxcount); 10513 } 10514 else { 10515 int rkind = skind; 10516 char *res; 10517 Py_ssize_t i; 10518 10519 if (kind1 < rkind) { 10520 /* widen substring */ 10521 buf1 = _PyUnicode_AsKind(str1, rkind); 10522 if (!buf1) goto error; 10523 release1 = 1; 10524 } 10525 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 
10526 if (i < 0) 10527 goto nothing; 10528 if (rkind > kind2) { 10529 /* widen replacement */ 10530 buf2 = _PyUnicode_AsKind(str2, rkind); 10531 if (!buf2) goto error; 10532 release2 = 1; 10533 } 10534 else if (rkind < kind2) { 10535 /* widen self and buf1 */ 10536 rkind = kind2; 10537 if (release1) PyMem_Free(buf1); 10538 release1 = 0; 10539 sbuf = _PyUnicode_AsKind(self, rkind); 10540 if (!sbuf) goto error; 10541 srelease = 1; 10542 buf1 = _PyUnicode_AsKind(str1, rkind); 10543 if (!buf1) goto error; 10544 release1 = 1; 10545 } 10546 u = PyUnicode_New(slen, maxchar); 10547 if (!u) 10548 goto error; 10549 assert(PyUnicode_KIND(u) == rkind); 10550 res = PyUnicode_DATA(u); 10551 10552 memcpy(res, sbuf, rkind * slen); 10553 /* change everything in-place, starting with this one */ 10554 memcpy(res + rkind * i, 10555 buf2, 10556 rkind * len2); 10557 i += len1; 10558 10559 while ( --maxcount > 0) { 10560 i = anylib_find(rkind, self, 10561 sbuf+rkind*i, slen-i, 10562 str1, buf1, len1, i); 10563 if (i == -1) 10564 break; 10565 memcpy(res + rkind * i, 10566 buf2, 10567 rkind * len2); 10568 i += len1; 10569 } 10570 } 10571 } 10572 else { 10573 Py_ssize_t n, i, j, ires; 10574 Py_ssize_t new_size; 10575 int rkind = skind; 10576 char *res; 10577 10578 if (kind1 < rkind) { 10579 /* widen substring */ 10580 buf1 = _PyUnicode_AsKind(str1, rkind); 10581 if (!buf1) goto error; 10582 release1 = 1; 10583 } 10584 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10585 if (n == 0) 10586 goto nothing; 10587 if (kind2 < rkind) { 10588 /* widen replacement */ 10589 buf2 = _PyUnicode_AsKind(str2, rkind); 10590 if (!buf2) goto error; 10591 release2 = 1; 10592 } 10593 else if (kind2 > rkind) { 10594 /* widen self and buf1 */ 10595 rkind = kind2; 10596 sbuf = _PyUnicode_AsKind(self, rkind); 10597 if (!sbuf) goto error; 10598 srelease = 1; 10599 if (release1) PyMem_Free(buf1); 10600 release1 = 0; 10601 buf1 = _PyUnicode_AsKind(str1, rkind); 10602 if (!buf1) goto error; 
10603 release1 = 1; 10604 } 10605 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10606 PyUnicode_GET_LENGTH(str1))); */ 10607 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) { 10608 PyErr_SetString(PyExc_OverflowError, 10609 "replace string is too long"); 10610 goto error; 10611 } 10612 new_size = slen + n * (len2 - len1); 10613 if (new_size == 0) { 10614 _Py_INCREF_UNICODE_EMPTY(); 10615 if (!unicode_empty) 10616 goto error; 10617 u = unicode_empty; 10618 goto done; 10619 } 10620 if (new_size > (PY_SSIZE_T_MAX / rkind)) { 10621 PyErr_SetString(PyExc_OverflowError, 10622 "replace string is too long"); 10623 goto error; 10624 } 10625 u = PyUnicode_New(new_size, maxchar); 10626 if (!u) 10627 goto error; 10628 assert(PyUnicode_KIND(u) == rkind); 10629 res = PyUnicode_DATA(u); 10630 ires = i = 0; 10631 if (len1 > 0) { 10632 while (n-- > 0) { 10633 /* look for next match */ 10634 j = anylib_find(rkind, self, 10635 sbuf + rkind * i, slen-i, 10636 str1, buf1, len1, i); 10637 if (j == -1) 10638 break; 10639 else if (j > i) { 10640 /* copy unchanged part [i:j] */ 10641 memcpy(res + rkind * ires, 10642 sbuf + rkind * i, 10643 rkind * (j-i)); 10644 ires += j - i; 10645 } 10646 /* copy substitution string */ 10647 if (len2 > 0) { 10648 memcpy(res + rkind * ires, 10649 buf2, 10650 rkind * len2); 10651 ires += len2; 10652 } 10653 i = j + len1; 10654 } 10655 if (i < slen) 10656 /* copy tail [i:] */ 10657 memcpy(res + rkind * ires, 10658 sbuf + rkind * i, 10659 rkind * (slen-i)); 10660 } 10661 else { 10662 /* interleave */ 10663 while (n > 0) { 10664 memcpy(res + rkind * ires, 10665 buf2, 10666 rkind * len2); 10667 ires += len2; 10668 if (--n <= 0) 10669 break; 10670 memcpy(res + rkind * ires, 10671 sbuf + rkind * i, 10672 rkind); 10673 ires++; 10674 i++; 10675 } 10676 memcpy(res + rkind * ires, 10677 sbuf + rkind * i, 10678 rkind * (slen-i)); 10679 } 10680 } 10681 10682 if (mayshrink) { 10683 unicode_adjust_maxchar(&u); 10684 if (u == 
NULL) 10685 goto error; 10686 } 10687 10688 done: 10689 if (srelease) 10690 PyMem_FREE(sbuf); 10691 if (release1) 10692 PyMem_FREE(buf1); 10693 if (release2) 10694 PyMem_FREE(buf2); 10695 assert(_PyUnicode_CheckConsistency(u, 1)); 10696 return u; 10697 10698 nothing: 10699 /* nothing to replace; return original string (when possible) */ 10700 if (srelease) 10701 PyMem_FREE(sbuf); 10702 if (release1) 10703 PyMem_FREE(buf1); 10704 if (release2) 10705 PyMem_FREE(buf2); 10706 return unicode_result_unchanged(self); 10707 10708 error: 10709 if (srelease && sbuf) 10710 PyMem_FREE(sbuf); 10711 if (release1 && buf1) 10712 PyMem_FREE(buf1); 10713 if (release2 && buf2) 10714 PyMem_FREE(buf2); 10715 return NULL; 10716 } 10717 10718 /* --- Unicode Object Methods --------------------------------------------- */ 10719 10720 /*[clinic input] 10721 str.title as unicode_title 10722 10723 Return a version of the string where each word is titlecased. 10724 10725 More specifically, words start with uppercased characters and all remaining 10726 cased characters have lower case. 10727 [clinic start generated code]*/ 10728 10729 static PyObject * 10730 unicode_title_impl(PyObject *self) 10731 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/ 10732 { 10733 if (PyUnicode_READY(self) == -1) 10734 return NULL; 10735 return case_operation(self, do_title); 10736 } 10737 10738 /*[clinic input] 10739 str.capitalize as unicode_capitalize 10740 10741 Return a capitalized version of the string. 10742 10743 More specifically, make the first character have upper case and the rest lower 10744 case. 
10745 [clinic start generated code]*/ 10746 10747 static PyObject * 10748 unicode_capitalize_impl(PyObject *self) 10749 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/ 10750 { 10751 if (PyUnicode_READY(self) == -1) 10752 return NULL; 10753 if (PyUnicode_GET_LENGTH(self) == 0) 10754 return unicode_result_unchanged(self); 10755 return case_operation(self, do_capitalize); 10756 } 10757 10758 /*[clinic input] 10759 str.casefold as unicode_casefold 10760 10761 Return a version of the string suitable for caseless comparisons. 10762 [clinic start generated code]*/ 10763 10764 static PyObject * 10765 unicode_casefold_impl(PyObject *self) 10766 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/ 10767 { 10768 if (PyUnicode_READY(self) == -1) 10769 return NULL; 10770 if (PyUnicode_IS_ASCII(self)) 10771 return ascii_upper_or_lower(self, 1); 10772 return case_operation(self, do_casefold); 10773 } 10774 10775 10776 /* Argument converter. Accepts a single Unicode character. */ 10777 10778 static int 10779 convert_uc(PyObject *obj, void *addr) 10780 { 10781 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10782 10783 if (!PyUnicode_Check(obj)) { 10784 PyErr_Format(PyExc_TypeError, 10785 "The fill character must be a unicode character, " 10786 "not %.100s", Py_TYPE(obj)->tp_name); 10787 return 0; 10788 } 10789 if (PyUnicode_READY(obj) < 0) 10790 return 0; 10791 if (PyUnicode_GET_LENGTH(obj) != 1) { 10792 PyErr_SetString(PyExc_TypeError, 10793 "The fill character must be exactly one character long"); 10794 return 0; 10795 } 10796 *fillcharloc = PyUnicode_READ_CHAR(obj, 0); 10797 return 1; 10798 } 10799 10800 /*[clinic input] 10801 str.center as unicode_center 10802 10803 width: Py_ssize_t 10804 fillchar: Py_UCS4 = ' ' 10805 / 10806 10807 Return a centered string of length width. 10808 10809 Padding is done using the specified fill character (default is a space). 
10810 [clinic start generated code]*/ 10811 10812 static PyObject * 10813 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar) 10814 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/ 10815 { 10816 Py_ssize_t marg, left; 10817 10818 if (PyUnicode_READY(self) == -1) 10819 return NULL; 10820 10821 if (PyUnicode_GET_LENGTH(self) >= width) 10822 return unicode_result_unchanged(self); 10823 10824 marg = width - PyUnicode_GET_LENGTH(self); 10825 left = marg / 2 + (marg & width & 1); 10826 10827 return pad(self, left, marg - left, fillchar); 10828 } 10829 10830 /* This function assumes that str1 and str2 are readied by the caller. */ 10831 10832 static int 10833 unicode_compare(PyObject *str1, PyObject *str2) 10834 { 10835 #define COMPARE(TYPE1, TYPE2) \ 10836 do { \ 10837 TYPE1* p1 = (TYPE1 *)data1; \ 10838 TYPE2* p2 = (TYPE2 *)data2; \ 10839 TYPE1* end = p1 + len; \ 10840 Py_UCS4 c1, c2; \ 10841 for (; p1 != end; p1++, p2++) { \ 10842 c1 = *p1; \ 10843 c2 = *p2; \ 10844 if (c1 != c2) \ 10845 return (c1 < c2) ? 
-1 : 1; \ 10846 } \ 10847 } \ 10848 while (0) 10849 10850 int kind1, kind2; 10851 void *data1, *data2; 10852 Py_ssize_t len1, len2, len; 10853 10854 kind1 = PyUnicode_KIND(str1); 10855 kind2 = PyUnicode_KIND(str2); 10856 data1 = PyUnicode_DATA(str1); 10857 data2 = PyUnicode_DATA(str2); 10858 len1 = PyUnicode_GET_LENGTH(str1); 10859 len2 = PyUnicode_GET_LENGTH(str2); 10860 len = Py_MIN(len1, len2); 10861 10862 switch(kind1) { 10863 case PyUnicode_1BYTE_KIND: 10864 { 10865 switch(kind2) { 10866 case PyUnicode_1BYTE_KIND: 10867 { 10868 int cmp = memcmp(data1, data2, len); 10869 /* normalize result of memcmp() into the range [-1; 1] */ 10870 if (cmp < 0) 10871 return -1; 10872 if (cmp > 0) 10873 return 1; 10874 break; 10875 } 10876 case PyUnicode_2BYTE_KIND: 10877 COMPARE(Py_UCS1, Py_UCS2); 10878 break; 10879 case PyUnicode_4BYTE_KIND: 10880 COMPARE(Py_UCS1, Py_UCS4); 10881 break; 10882 default: 10883 Py_UNREACHABLE(); 10884 } 10885 break; 10886 } 10887 case PyUnicode_2BYTE_KIND: 10888 { 10889 switch(kind2) { 10890 case PyUnicode_1BYTE_KIND: 10891 COMPARE(Py_UCS2, Py_UCS1); 10892 break; 10893 case PyUnicode_2BYTE_KIND: 10894 { 10895 COMPARE(Py_UCS2, Py_UCS2); 10896 break; 10897 } 10898 case PyUnicode_4BYTE_KIND: 10899 COMPARE(Py_UCS2, Py_UCS4); 10900 break; 10901 default: 10902 Py_UNREACHABLE(); 10903 } 10904 break; 10905 } 10906 case PyUnicode_4BYTE_KIND: 10907 { 10908 switch(kind2) { 10909 case PyUnicode_1BYTE_KIND: 10910 COMPARE(Py_UCS4, Py_UCS1); 10911 break; 10912 case PyUnicode_2BYTE_KIND: 10913 COMPARE(Py_UCS4, Py_UCS2); 10914 break; 10915 case PyUnicode_4BYTE_KIND: 10916 { 10917 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4 10918 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len); 10919 /* normalize result of wmemcmp() into the range [-1; 1] */ 10920 if (cmp < 0) 10921 return -1; 10922 if (cmp > 0) 10923 return 1; 10924 #else 10925 COMPARE(Py_UCS4, Py_UCS4); 10926 #endif 10927 break; 10928 } 10929 default: 10930 Py_UNREACHABLE(); 10931 } 10932 break; 
10933 } 10934 default: 10935 Py_UNREACHABLE(); 10936 } 10937 10938 if (len1 == len2) 10939 return 0; 10940 if (len1 < len2) 10941 return -1; 10942 else 10943 return 1; 10944 10945 #undef COMPARE 10946 } 10947 10948 static int 10949 unicode_compare_eq(PyObject *str1, PyObject *str2) 10950 { 10951 int kind; 10952 void *data1, *data2; 10953 Py_ssize_t len; 10954 int cmp; 10955 10956 len = PyUnicode_GET_LENGTH(str1); 10957 if (PyUnicode_GET_LENGTH(str2) != len) 10958 return 0; 10959 kind = PyUnicode_KIND(str1); 10960 if (PyUnicode_KIND(str2) != kind) 10961 return 0; 10962 data1 = PyUnicode_DATA(str1); 10963 data2 = PyUnicode_DATA(str2); 10964 10965 cmp = memcmp(data1, data2, len * kind); 10966 return (cmp == 0); 10967 } 10968 10969 10970 int 10971 PyUnicode_Compare(PyObject *left, PyObject *right) 10972 { 10973 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10974 if (PyUnicode_READY(left) == -1 || 10975 PyUnicode_READY(right) == -1) 10976 return -1; 10977 10978 /* a string is equal to itself */ 10979 if (left == right) 10980 return 0; 10981 10982 return unicode_compare(left, right); 10983 } 10984 PyErr_Format(PyExc_TypeError, 10985 "Can't compare %.100s and %.100s", 10986 left->ob_type->tp_name, 10987 right->ob_type->tp_name); 10988 return -1; 10989 } 10990 10991 int 10992 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 10993 { 10994 Py_ssize_t i; 10995 int kind; 10996 Py_UCS4 chr; 10997 const unsigned char *ustr = (const unsigned char *)str; 10998 10999 assert(_PyUnicode_CHECK(uni)); 11000 if (!PyUnicode_IS_READY(uni)) { 11001 const wchar_t *ws = _PyUnicode_WSTR(uni); 11002 /* Compare Unicode string and source character set string */ 11003 for (i = 0; (chr = ws[i]) && ustr[i]; i++) { 11004 if (chr != ustr[i]) 11005 return (chr < ustr[i]) ? -1 : 1; 11006 } 11007 /* This check keeps Python strings that end in '\0' from comparing equal 11008 to C strings identical up to that point. 
*/ 11009 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr) 11010 return 1; /* uni is longer */ 11011 if (ustr[i]) 11012 return -1; /* str is longer */ 11013 return 0; 11014 } 11015 kind = PyUnicode_KIND(uni); 11016 if (kind == PyUnicode_1BYTE_KIND) { 11017 const void *data = PyUnicode_1BYTE_DATA(uni); 11018 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni); 11019 size_t len, len2 = strlen(str); 11020 int cmp; 11021 11022 len = Py_MIN(len1, len2); 11023 cmp = memcmp(data, str, len); 11024 if (cmp != 0) { 11025 if (cmp < 0) 11026 return -1; 11027 else 11028 return 1; 11029 } 11030 if (len1 > len2) 11031 return 1; /* uni is longer */ 11032 if (len1 < len2) 11033 return -1; /* str is longer */ 11034 return 0; 11035 } 11036 else { 11037 void *data = PyUnicode_DATA(uni); 11038 /* Compare Unicode string and source character set string */ 11039 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 11040 if (chr != (unsigned char)str[i]) 11041 return (chr < (unsigned char)(str[i])) ? -1 : 1; 11042 /* This check keeps Python strings that end in '\0' from comparing equal 11043 to C strings identical up to that point. 
*/ 11044 if (PyUnicode_GET_LENGTH(uni) != i || chr) 11045 return 1; /* uni is longer */ 11046 if (str[i]) 11047 return -1; /* str is longer */ 11048 return 0; 11049 } 11050 } 11051 11052 static int 11053 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str) 11054 { 11055 size_t i, len; 11056 const wchar_t *p; 11057 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode); 11058 if (strlen(str) != len) 11059 return 0; 11060 p = _PyUnicode_WSTR(unicode); 11061 assert(p); 11062 for (i = 0; i < len; i++) { 11063 unsigned char c = (unsigned char)str[i]; 11064 if (c >= 128 || p[i] != (wchar_t)c) 11065 return 0; 11066 } 11067 return 1; 11068 } 11069 11070 int 11071 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str) 11072 { 11073 size_t len; 11074 assert(_PyUnicode_CHECK(unicode)); 11075 assert(str); 11076 #ifndef NDEBUG 11077 for (const char *p = str; *p; p++) { 11078 assert((unsigned char)*p < 128); 11079 } 11080 #endif 11081 if (PyUnicode_READY(unicode) == -1) { 11082 /* Memory error or bad data */ 11083 PyErr_Clear(); 11084 return non_ready_unicode_equal_to_ascii_string(unicode, str); 11085 } 11086 if (!PyUnicode_IS_ASCII(unicode)) 11087 return 0; 11088 len = (size_t)PyUnicode_GET_LENGTH(unicode); 11089 return strlen(str) == len && 11090 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0; 11091 } 11092 11093 int 11094 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right) 11095 { 11096 PyObject *right_uni; 11097 Py_hash_t hash; 11098 11099 assert(_PyUnicode_CHECK(left)); 11100 assert(right->string); 11101 #ifndef NDEBUG 11102 for (const char *p = right->string; *p; p++) { 11103 assert((unsigned char)*p < 128); 11104 } 11105 #endif 11106 11107 if (PyUnicode_READY(left) == -1) { 11108 /* memory error or bad data */ 11109 PyErr_Clear(); 11110 return non_ready_unicode_equal_to_ascii_string(left, right->string); 11111 } 11112 11113 if (!PyUnicode_IS_ASCII(left)) 11114 return 0; 11115 11116 right_uni = _PyUnicode_FromId(right); /* 
borrowed */ 11117 if (right_uni == NULL) { 11118 /* memory error or bad data */ 11119 PyErr_Clear(); 11120 return _PyUnicode_EqualToASCIIString(left, right->string); 11121 } 11122 11123 if (left == right_uni) 11124 return 1; 11125 11126 if (PyUnicode_CHECK_INTERNED(left)) 11127 return 0; 11128 11129 assert(_PyUnicode_HASH(right_uni) != -1); 11130 hash = _PyUnicode_HASH(left); 11131 if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) 11132 return 0; 11133 11134 return unicode_compare_eq(left, right_uni); 11135 } 11136 11137 PyObject * 11138 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 11139 { 11140 int result; 11141 11142 if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) 11143 Py_RETURN_NOTIMPLEMENTED; 11144 11145 if (PyUnicode_READY(left) == -1 || 11146 PyUnicode_READY(right) == -1) 11147 return NULL; 11148 11149 if (left == right) { 11150 switch (op) { 11151 case Py_EQ: 11152 case Py_LE: 11153 case Py_GE: 11154 /* a string is equal to itself */ 11155 Py_RETURN_TRUE; 11156 case Py_NE: 11157 case Py_LT: 11158 case Py_GT: 11159 Py_RETURN_FALSE; 11160 default: 11161 PyErr_BadArgument(); 11162 return NULL; 11163 } 11164 } 11165 else if (op == Py_EQ || op == Py_NE) { 11166 result = unicode_compare_eq(left, right); 11167 result ^= (op == Py_NE); 11168 return PyBool_FromLong(result); 11169 } 11170 else { 11171 result = unicode_compare(left, right); 11172 Py_RETURN_RICHCOMPARE(result, 0, op); 11173 } 11174 } 11175 11176 int 11177 _PyUnicode_EQ(PyObject *aa, PyObject *bb) 11178 { 11179 return unicode_eq(aa, bb); 11180 } 11181 11182 int 11183 PyUnicode_Contains(PyObject *str, PyObject *substr) 11184 { 11185 int kind1, kind2; 11186 void *buf1, *buf2; 11187 Py_ssize_t len1, len2; 11188 int result; 11189 11190 if (!PyUnicode_Check(substr)) { 11191 PyErr_Format(PyExc_TypeError, 11192 "'in <string>' requires string as left operand, not %.100s", 11193 Py_TYPE(substr)->tp_name); 11194 return -1; 11195 } 11196 if (PyUnicode_READY(substr) == -1) 11197 return 
-1; 11198 if (ensure_unicode(str) < 0) 11199 return -1; 11200 11201 kind1 = PyUnicode_KIND(str); 11202 kind2 = PyUnicode_KIND(substr); 11203 if (kind1 < kind2) 11204 return 0; 11205 len1 = PyUnicode_GET_LENGTH(str); 11206 len2 = PyUnicode_GET_LENGTH(substr); 11207 if (len1 < len2) 11208 return 0; 11209 buf1 = PyUnicode_DATA(str); 11210 buf2 = PyUnicode_DATA(substr); 11211 if (len2 == 1) { 11212 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0); 11213 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1; 11214 return result; 11215 } 11216 if (kind2 != kind1) { 11217 buf2 = _PyUnicode_AsKind(substr, kind1); 11218 if (!buf2) 11219 return -1; 11220 } 11221 11222 switch (kind1) { 11223 case PyUnicode_1BYTE_KIND: 11224 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 11225 break; 11226 case PyUnicode_2BYTE_KIND: 11227 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 11228 break; 11229 case PyUnicode_4BYTE_KIND: 11230 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 11231 break; 11232 default: 11233 Py_UNREACHABLE(); 11234 } 11235 11236 if (kind2 != kind1) 11237 PyMem_Free(buf2); 11238 11239 return result; 11240 } 11241 11242 /* Concat to string or Unicode object giving a new Unicode object. 
*/ 11243 11244 PyObject * 11245 PyUnicode_Concat(PyObject *left, PyObject *right) 11246 { 11247 PyObject *result; 11248 Py_UCS4 maxchar, maxchar2; 11249 Py_ssize_t left_len, right_len, new_len; 11250 11251 if (ensure_unicode(left) < 0) 11252 return NULL; 11253 11254 if (!PyUnicode_Check(right)) { 11255 PyErr_Format(PyExc_TypeError, 11256 "can only concatenate str (not \"%.200s\") to str", 11257 right->ob_type->tp_name); 11258 return NULL; 11259 } 11260 if (PyUnicode_READY(right) < 0) 11261 return NULL; 11262 11263 /* Shortcuts */ 11264 if (left == unicode_empty) 11265 return PyUnicode_FromObject(right); 11266 if (right == unicode_empty) 11267 return PyUnicode_FromObject(left); 11268 11269 left_len = PyUnicode_GET_LENGTH(left); 11270 right_len = PyUnicode_GET_LENGTH(right); 11271 if (left_len > PY_SSIZE_T_MAX - right_len) { 11272 PyErr_SetString(PyExc_OverflowError, 11273 "strings are too large to concat"); 11274 return NULL; 11275 } 11276 new_len = left_len + right_len; 11277 11278 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 11279 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 11280 maxchar = Py_MAX(maxchar, maxchar2); 11281 11282 /* Concat the two Unicode strings */ 11283 result = PyUnicode_New(new_len, maxchar); 11284 if (result == NULL) 11285 return NULL; 11286 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len); 11287 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len); 11288 assert(_PyUnicode_CheckConsistency(result, 1)); 11289 return result; 11290 } 11291 11292 void 11293 PyUnicode_Append(PyObject **p_left, PyObject *right) 11294 { 11295 PyObject *left, *res; 11296 Py_UCS4 maxchar, maxchar2; 11297 Py_ssize_t left_len, right_len, new_len; 11298 11299 if (p_left == NULL) { 11300 if (!PyErr_Occurred()) 11301 PyErr_BadInternalCall(); 11302 return; 11303 } 11304 left = *p_left; 11305 if (right == NULL || left == NULL 11306 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) { 11307 if (!PyErr_Occurred()) 11308 PyErr_BadInternalCall(); 11309 
goto error; 11310 } 11311 11312 if (PyUnicode_READY(left) == -1) 11313 goto error; 11314 if (PyUnicode_READY(right) == -1) 11315 goto error; 11316 11317 /* Shortcuts */ 11318 if (left == unicode_empty) { 11319 Py_DECREF(left); 11320 Py_INCREF(right); 11321 *p_left = right; 11322 return; 11323 } 11324 if (right == unicode_empty) 11325 return; 11326 11327 left_len = PyUnicode_GET_LENGTH(left); 11328 right_len = PyUnicode_GET_LENGTH(right); 11329 if (left_len > PY_SSIZE_T_MAX - right_len) { 11330 PyErr_SetString(PyExc_OverflowError, 11331 "strings are too large to concat"); 11332 goto error; 11333 } 11334 new_len = left_len + right_len; 11335 11336 if (unicode_modifiable(left) 11337 && PyUnicode_CheckExact(right) 11338 && PyUnicode_KIND(right) <= PyUnicode_KIND(left) 11339 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 11340 to change the structure size, but characters are stored just after 11341 the structure, and so it requires to move all characters which is 11342 not so different than duplicating the string. 
*/ 11343 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 11344 { 11345 /* append inplace */ 11346 if (unicode_resize(p_left, new_len) != 0) 11347 goto error; 11348 11349 /* copy 'right' into the newly allocated area of 'left' */ 11350 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len); 11351 } 11352 else { 11353 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 11354 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 11355 maxchar = Py_MAX(maxchar, maxchar2); 11356 11357 /* Concat the two Unicode strings */ 11358 res = PyUnicode_New(new_len, maxchar); 11359 if (res == NULL) 11360 goto error; 11361 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len); 11362 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len); 11363 Py_DECREF(left); 11364 *p_left = res; 11365 } 11366 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 11367 return; 11368 11369 error: 11370 Py_CLEAR(*p_left); 11371 } 11372 11373 void 11374 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 11375 { 11376 PyUnicode_Append(pleft, right); 11377 Py_XDECREF(right); 11378 } 11379 11380 /* 11381 Wraps stringlib_parse_args_finds() and additionally ensures that the 11382 first argument is a unicode object. 11383 */ 11384 11385 static inline int 11386 parse_args_finds_unicode(const char * function_name, PyObject *args, 11387 PyObject **substring, 11388 Py_ssize_t *start, Py_ssize_t *end) 11389 { 11390 if(stringlib_parse_args_finds(function_name, args, substring, 11391 start, end)) { 11392 if (ensure_unicode(*substring) < 0) 11393 return 0; 11394 return 1; 11395 } 11396 return 0; 11397 } 11398 11399 PyDoc_STRVAR(count__doc__, 11400 "S.count(sub[, start[, end]]) -> int\n\ 11401 \n\ 11402 Return the number of non-overlapping occurrences of substring sub in\n\ 11403 string S[start:end]. 
Optional arguments start and end are\n\ 11404 interpreted as in slice notation."); 11405 11406 static PyObject * 11407 unicode_count(PyObject *self, PyObject *args) 11408 { 11409 PyObject *substring = NULL; /* initialize to fix a compiler warning */ 11410 Py_ssize_t start = 0; 11411 Py_ssize_t end = PY_SSIZE_T_MAX; 11412 PyObject *result; 11413 int kind1, kind2; 11414 void *buf1, *buf2; 11415 Py_ssize_t len1, len2, iresult; 11416 11417 if (!parse_args_finds_unicode("count", args, &substring, &start, &end)) 11418 return NULL; 11419 11420 kind1 = PyUnicode_KIND(self); 11421 kind2 = PyUnicode_KIND(substring); 11422 if (kind1 < kind2) 11423 return PyLong_FromLong(0); 11424 11425 len1 = PyUnicode_GET_LENGTH(self); 11426 len2 = PyUnicode_GET_LENGTH(substring); 11427 ADJUST_INDICES(start, end, len1); 11428 if (end - start < len2) 11429 return PyLong_FromLong(0); 11430 11431 buf1 = PyUnicode_DATA(self); 11432 buf2 = PyUnicode_DATA(substring); 11433 if (kind2 != kind1) { 11434 buf2 = _PyUnicode_AsKind(substring, kind1); 11435 if (!buf2) 11436 return NULL; 11437 } 11438 switch (kind1) { 11439 case PyUnicode_1BYTE_KIND: 11440 iresult = ucs1lib_count( 11441 ((Py_UCS1*)buf1) + start, end - start, 11442 buf2, len2, PY_SSIZE_T_MAX 11443 ); 11444 break; 11445 case PyUnicode_2BYTE_KIND: 11446 iresult = ucs2lib_count( 11447 ((Py_UCS2*)buf1) + start, end - start, 11448 buf2, len2, PY_SSIZE_T_MAX 11449 ); 11450 break; 11451 case PyUnicode_4BYTE_KIND: 11452 iresult = ucs4lib_count( 11453 ((Py_UCS4*)buf1) + start, end - start, 11454 buf2, len2, PY_SSIZE_T_MAX 11455 ); 11456 break; 11457 default: 11458 Py_UNREACHABLE(); 11459 } 11460 11461 result = PyLong_FromSsize_t(iresult); 11462 11463 if (kind2 != kind1) 11464 PyMem_Free(buf2); 11465 11466 return result; 11467 } 11468 11469 /*[clinic input] 11470 str.encode as unicode_encode 11471 11472 encoding: str(c_default="NULL") = 'utf-8' 11473 The encoding in which to encode the string. 
11474 errors: str(c_default="NULL") = 'strict' 11475 The error handling scheme to use for encoding errors. 11476 The default is 'strict' meaning that encoding errors raise a 11477 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and 11478 'xmlcharrefreplace' as well as any other name registered with 11479 codecs.register_error that can handle UnicodeEncodeErrors. 11480 11481 Encode the string using the codec registered for encoding. 11482 [clinic start generated code]*/ 11483 11484 static PyObject * 11485 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors) 11486 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/ 11487 { 11488 return PyUnicode_AsEncodedString(self, encoding, errors); 11489 } 11490 11491 /*[clinic input] 11492 str.expandtabs as unicode_expandtabs 11493 11494 tabsize: int = 8 11495 11496 Return a copy where all tab characters are expanded using spaces. 11497 11498 If tabsize is not given, a tab size of 8 characters is assumed. 
11499 [clinic start generated code]*/ 11500 11501 static PyObject * 11502 unicode_expandtabs_impl(PyObject *self, int tabsize) 11503 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/ 11504 { 11505 Py_ssize_t i, j, line_pos, src_len, incr; 11506 Py_UCS4 ch; 11507 PyObject *u; 11508 void *src_data, *dest_data; 11509 int kind; 11510 int found; 11511 11512 if (PyUnicode_READY(self) == -1) 11513 return NULL; 11514 11515 /* First pass: determine size of output string */ 11516 src_len = PyUnicode_GET_LENGTH(self); 11517 i = j = line_pos = 0; 11518 kind = PyUnicode_KIND(self); 11519 src_data = PyUnicode_DATA(self); 11520 found = 0; 11521 for (; i < src_len; i++) { 11522 ch = PyUnicode_READ(kind, src_data, i); 11523 if (ch == '\t') { 11524 found = 1; 11525 if (tabsize > 0) { 11526 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 11527 if (j > PY_SSIZE_T_MAX - incr) 11528 goto overflow; 11529 line_pos += incr; 11530 j += incr; 11531 } 11532 } 11533 else { 11534 if (j > PY_SSIZE_T_MAX - 1) 11535 goto overflow; 11536 line_pos++; 11537 j++; 11538 if (ch == '\n' || ch == '\r') 11539 line_pos = 0; 11540 } 11541 } 11542 if (!found) 11543 return unicode_result_unchanged(self); 11544 11545 /* Second pass: create output string and fill it */ 11546 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 11547 if (!u) 11548 return NULL; 11549 dest_data = PyUnicode_DATA(u); 11550 11551 i = j = line_pos = 0; 11552 11553 for (; i < src_len; i++) { 11554 ch = PyUnicode_READ(kind, src_data, i); 11555 if (ch == '\t') { 11556 if (tabsize > 0) { 11557 incr = tabsize - (line_pos % tabsize); 11558 line_pos += incr; 11559 FILL(kind, dest_data, ' ', j, incr); 11560 j += incr; 11561 } 11562 } 11563 else { 11564 line_pos++; 11565 PyUnicode_WRITE(kind, dest_data, j, ch); 11566 j++; 11567 if (ch == '\n' || ch == '\r') 11568 line_pos = 0; 11569 } 11570 } 11571 assert (j == PyUnicode_GET_LENGTH(u)); 11572 return unicode_result(u); 11573 11574 overflow: 11575 
PyErr_SetString(PyExc_OverflowError, "new string is too long"); 11576 return NULL; 11577 } 11578 11579 PyDoc_STRVAR(find__doc__, 11580 "S.find(sub[, start[, end]]) -> int\n\ 11581 \n\ 11582 Return the lowest index in S where substring sub is found,\n\ 11583 such that sub is contained within S[start:end]. Optional\n\ 11584 arguments start and end are interpreted as in slice notation.\n\ 11585 \n\ 11586 Return -1 on failure."); 11587 11588 static PyObject * 11589 unicode_find(PyObject *self, PyObject *args) 11590 { 11591 /* initialize variables to prevent gcc warning */ 11592 PyObject *substring = NULL; 11593 Py_ssize_t start = 0; 11594 Py_ssize_t end = 0; 11595 Py_ssize_t result; 11596 11597 if (!parse_args_finds_unicode("find", args, &substring, &start, &end)) 11598 return NULL; 11599 11600 if (PyUnicode_READY(self) == -1) 11601 return NULL; 11602 11603 result = any_find_slice(self, substring, start, end, 1); 11604 11605 if (result == -2) 11606 return NULL; 11607 11608 return PyLong_FromSsize_t(result); 11609 } 11610 11611 static PyObject * 11612 unicode_getitem(PyObject *self, Py_ssize_t index) 11613 { 11614 void *data; 11615 enum PyUnicode_Kind kind; 11616 Py_UCS4 ch; 11617 11618 if (!PyUnicode_Check(self)) { 11619 PyErr_BadArgument(); 11620 return NULL; 11621 } 11622 if (PyUnicode_READY(self) == -1) { 11623 return NULL; 11624 } 11625 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { 11626 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11627 return NULL; 11628 } 11629 kind = PyUnicode_KIND(self); 11630 data = PyUnicode_DATA(self); 11631 ch = PyUnicode_READ(kind, data, index); 11632 return unicode_char(ch); 11633 } 11634 11635 /* Believe it or not, this produces the same value for ASCII strings 11636 as bytes_hash(). */ 11637 static Py_hash_t 11638 unicode_hash(PyObject *self) 11639 { 11640 Py_ssize_t len; 11641 Py_uhash_t x; /* Unsigned for defined overflow behavior. 
*/ 11642 11643 #ifdef Py_DEBUG 11644 assert(_Py_HashSecret_Initialized); 11645 #endif 11646 if (_PyUnicode_HASH(self) != -1) 11647 return _PyUnicode_HASH(self); 11648 if (PyUnicode_READY(self) == -1) 11649 return -1; 11650 len = PyUnicode_GET_LENGTH(self); 11651 /* 11652 We make the hash of the empty string be 0, rather than using 11653 (prefix ^ suffix), since this slightly obfuscates the hash secret 11654 */ 11655 if (len == 0) { 11656 _PyUnicode_HASH(self) = 0; 11657 return 0; 11658 } 11659 x = _Py_HashBytes(PyUnicode_DATA(self), 11660 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self)); 11661 _PyUnicode_HASH(self) = x; 11662 return x; 11663 } 11664 11665 PyDoc_STRVAR(index__doc__, 11666 "S.index(sub[, start[, end]]) -> int\n\ 11667 \n\ 11668 Return the lowest index in S where substring sub is found, \n\ 11669 such that sub is contained within S[start:end]. Optional\n\ 11670 arguments start and end are interpreted as in slice notation.\n\ 11671 \n\ 11672 Raises ValueError when the substring is not found."); 11673 11674 static PyObject * 11675 unicode_index(PyObject *self, PyObject *args) 11676 { 11677 /* initialize variables to prevent gcc warning */ 11678 Py_ssize_t result; 11679 PyObject *substring = NULL; 11680 Py_ssize_t start = 0; 11681 Py_ssize_t end = 0; 11682 11683 if (!parse_args_finds_unicode("index", args, &substring, &start, &end)) 11684 return NULL; 11685 11686 if (PyUnicode_READY(self) == -1) 11687 return NULL; 11688 11689 result = any_find_slice(self, substring, start, end, 1); 11690 11691 if (result == -2) 11692 return NULL; 11693 11694 if (result < 0) { 11695 PyErr_SetString(PyExc_ValueError, "substring not found"); 11696 return NULL; 11697 } 11698 11699 return PyLong_FromSsize_t(result); 11700 } 11701 11702 /*[clinic input] 11703 str.isascii as unicode_isascii 11704 11705 Return True if all characters in the string are ASCII, False otherwise. 11706 11707 ASCII characters have code points in the range U+0000-U+007F. 
11708 Empty string is ASCII too. 11709 [clinic start generated code]*/ 11710 11711 static PyObject * 11712 unicode_isascii_impl(PyObject *self) 11713 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/ 11714 { 11715 if (PyUnicode_READY(self) == -1) { 11716 return NULL; 11717 } 11718 return PyBool_FromLong(PyUnicode_IS_ASCII(self)); 11719 } 11720 11721 /*[clinic input] 11722 str.islower as unicode_islower 11723 11724 Return True if the string is a lowercase string, False otherwise. 11725 11726 A string is lowercase if all cased characters in the string are lowercase and 11727 there is at least one cased character in the string. 11728 [clinic start generated code]*/ 11729 11730 static PyObject * 11731 unicode_islower_impl(PyObject *self) 11732 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/ 11733 { 11734 Py_ssize_t i, length; 11735 int kind; 11736 void *data; 11737 int cased; 11738 11739 if (PyUnicode_READY(self) == -1) 11740 return NULL; 11741 length = PyUnicode_GET_LENGTH(self); 11742 kind = PyUnicode_KIND(self); 11743 data = PyUnicode_DATA(self); 11744 11745 /* Shortcut for single character strings */ 11746 if (length == 1) 11747 return PyBool_FromLong( 11748 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11749 11750 /* Special case for empty strings */ 11751 if (length == 0) 11752 Py_RETURN_FALSE; 11753 11754 cased = 0; 11755 for (i = 0; i < length; i++) { 11756 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11757 11758 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11759 Py_RETURN_FALSE; 11760 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11761 cased = 1; 11762 } 11763 return PyBool_FromLong(cased); 11764 } 11765 11766 /*[clinic input] 11767 str.isupper as unicode_isupper 11768 11769 Return True if the string is an uppercase string, False otherwise. 
11770 11771 A string is uppercase if all cased characters in the string are uppercase and 11772 there is at least one cased character in the string. 11773 [clinic start generated code]*/ 11774 11775 static PyObject * 11776 unicode_isupper_impl(PyObject *self) 11777 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/ 11778 { 11779 Py_ssize_t i, length; 11780 int kind; 11781 void *data; 11782 int cased; 11783 11784 if (PyUnicode_READY(self) == -1) 11785 return NULL; 11786 length = PyUnicode_GET_LENGTH(self); 11787 kind = PyUnicode_KIND(self); 11788 data = PyUnicode_DATA(self); 11789 11790 /* Shortcut for single character strings */ 11791 if (length == 1) 11792 return PyBool_FromLong( 11793 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11794 11795 /* Special case for empty strings */ 11796 if (length == 0) 11797 Py_RETURN_FALSE; 11798 11799 cased = 0; 11800 for (i = 0; i < length; i++) { 11801 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11802 11803 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11804 Py_RETURN_FALSE; 11805 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11806 cased = 1; 11807 } 11808 return PyBool_FromLong(cased); 11809 } 11810 11811 /*[clinic input] 11812 str.istitle as unicode_istitle 11813 11814 Return True if the string is a title-cased string, False otherwise. 11815 11816 In a title-cased string, upper- and title-case characters may only 11817 follow uncased characters and lowercase characters only cased ones. 
11818 [clinic start generated code]*/ 11819 11820 static PyObject * 11821 unicode_istitle_impl(PyObject *self) 11822 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/ 11823 { 11824 Py_ssize_t i, length; 11825 int kind; 11826 void *data; 11827 int cased, previous_is_cased; 11828 11829 if (PyUnicode_READY(self) == -1) 11830 return NULL; 11831 length = PyUnicode_GET_LENGTH(self); 11832 kind = PyUnicode_KIND(self); 11833 data = PyUnicode_DATA(self); 11834 11835 /* Shortcut for single character strings */ 11836 if (length == 1) { 11837 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11838 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11839 (Py_UNICODE_ISUPPER(ch) != 0)); 11840 } 11841 11842 /* Special case for empty strings */ 11843 if (length == 0) 11844 Py_RETURN_FALSE; 11845 11846 cased = 0; 11847 previous_is_cased = 0; 11848 for (i = 0; i < length; i++) { 11849 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11850 11851 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11852 if (previous_is_cased) 11853 Py_RETURN_FALSE; 11854 previous_is_cased = 1; 11855 cased = 1; 11856 } 11857 else if (Py_UNICODE_ISLOWER(ch)) { 11858 if (!previous_is_cased) 11859 Py_RETURN_FALSE; 11860 previous_is_cased = 1; 11861 cased = 1; 11862 } 11863 else 11864 previous_is_cased = 0; 11865 } 11866 return PyBool_FromLong(cased); 11867 } 11868 11869 /*[clinic input] 11870 str.isspace as unicode_isspace 11871 11872 Return True if the string is a whitespace string, False otherwise. 11873 11874 A string is whitespace if all characters in the string are whitespace and there 11875 is at least one character in the string. 
11876 [clinic start generated code]*/ 11877 11878 static PyObject * 11879 unicode_isspace_impl(PyObject *self) 11880 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/ 11881 { 11882 Py_ssize_t i, length; 11883 int kind; 11884 void *data; 11885 11886 if (PyUnicode_READY(self) == -1) 11887 return NULL; 11888 length = PyUnicode_GET_LENGTH(self); 11889 kind = PyUnicode_KIND(self); 11890 data = PyUnicode_DATA(self); 11891 11892 /* Shortcut for single character strings */ 11893 if (length == 1) 11894 return PyBool_FromLong( 11895 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11896 11897 /* Special case for empty strings */ 11898 if (length == 0) 11899 Py_RETURN_FALSE; 11900 11901 for (i = 0; i < length; i++) { 11902 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11903 if (!Py_UNICODE_ISSPACE(ch)) 11904 Py_RETURN_FALSE; 11905 } 11906 Py_RETURN_TRUE; 11907 } 11908 11909 /*[clinic input] 11910 str.isalpha as unicode_isalpha 11911 11912 Return True if the string is an alphabetic string, False otherwise. 11913 11914 A string is alphabetic if all characters in the string are alphabetic and there 11915 is at least one character in the string. 
11916 [clinic start generated code]*/ 11917 11918 static PyObject * 11919 unicode_isalpha_impl(PyObject *self) 11920 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/ 11921 { 11922 Py_ssize_t i, length; 11923 int kind; 11924 void *data; 11925 11926 if (PyUnicode_READY(self) == -1) 11927 return NULL; 11928 length = PyUnicode_GET_LENGTH(self); 11929 kind = PyUnicode_KIND(self); 11930 data = PyUnicode_DATA(self); 11931 11932 /* Shortcut for single character strings */ 11933 if (length == 1) 11934 return PyBool_FromLong( 11935 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11936 11937 /* Special case for empty strings */ 11938 if (length == 0) 11939 Py_RETURN_FALSE; 11940 11941 for (i = 0; i < length; i++) { 11942 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11943 Py_RETURN_FALSE; 11944 } 11945 Py_RETURN_TRUE; 11946 } 11947 11948 /*[clinic input] 11949 str.isalnum as unicode_isalnum 11950 11951 Return True if the string is an alpha-numeric string, False otherwise. 11952 11953 A string is alpha-numeric if all characters in the string are alpha-numeric and 11954 there is at least one character in the string. 
11955 [clinic start generated code]*/ 11956 11957 static PyObject * 11958 unicode_isalnum_impl(PyObject *self) 11959 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/ 11960 { 11961 int kind; 11962 void *data; 11963 Py_ssize_t len, i; 11964 11965 if (PyUnicode_READY(self) == -1) 11966 return NULL; 11967 11968 kind = PyUnicode_KIND(self); 11969 data = PyUnicode_DATA(self); 11970 len = PyUnicode_GET_LENGTH(self); 11971 11972 /* Shortcut for single character strings */ 11973 if (len == 1) { 11974 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11975 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11976 } 11977 11978 /* Special case for empty strings */ 11979 if (len == 0) 11980 Py_RETURN_FALSE; 11981 11982 for (i = 0; i < len; i++) { 11983 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11984 if (!Py_UNICODE_ISALNUM(ch)) 11985 Py_RETURN_FALSE; 11986 } 11987 Py_RETURN_TRUE; 11988 } 11989 11990 /*[clinic input] 11991 str.isdecimal as unicode_isdecimal 11992 11993 Return True if the string is a decimal string, False otherwise. 11994 11995 A string is a decimal string if all characters in the string are decimal and 11996 there is at least one character in the string. 
11997 [clinic start generated code]*/ 11998 11999 static PyObject * 12000 unicode_isdecimal_impl(PyObject *self) 12001 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/ 12002 { 12003 Py_ssize_t i, length; 12004 int kind; 12005 void *data; 12006 12007 if (PyUnicode_READY(self) == -1) 12008 return NULL; 12009 length = PyUnicode_GET_LENGTH(self); 12010 kind = PyUnicode_KIND(self); 12011 data = PyUnicode_DATA(self); 12012 12013 /* Shortcut for single character strings */ 12014 if (length == 1) 12015 return PyBool_FromLong( 12016 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 12017 12018 /* Special case for empty strings */ 12019 if (length == 0) 12020 Py_RETURN_FALSE; 12021 12022 for (i = 0; i < length; i++) { 12023 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 12024 Py_RETURN_FALSE; 12025 } 12026 Py_RETURN_TRUE; 12027 } 12028 12029 /*[clinic input] 12030 str.isdigit as unicode_isdigit 12031 12032 Return True if the string is a digit string, False otherwise. 12033 12034 A string is a digit string if all characters in the string are digits and there 12035 is at least one character in the string. 
12036 [clinic start generated code]*/ 12037 12038 static PyObject * 12039 unicode_isdigit_impl(PyObject *self) 12040 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/ 12041 { 12042 Py_ssize_t i, length; 12043 int kind; 12044 void *data; 12045 12046 if (PyUnicode_READY(self) == -1) 12047 return NULL; 12048 length = PyUnicode_GET_LENGTH(self); 12049 kind = PyUnicode_KIND(self); 12050 data = PyUnicode_DATA(self); 12051 12052 /* Shortcut for single character strings */ 12053 if (length == 1) { 12054 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 12055 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 12056 } 12057 12058 /* Special case for empty strings */ 12059 if (length == 0) 12060 Py_RETURN_FALSE; 12061 12062 for (i = 0; i < length; i++) { 12063 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 12064 Py_RETURN_FALSE; 12065 } 12066 Py_RETURN_TRUE; 12067 } 12068 12069 /*[clinic input] 12070 str.isnumeric as unicode_isnumeric 12071 12072 Return True if the string is a numeric string, False otherwise. 12073 12074 A string is numeric if all characters in the string are numeric and there is at 12075 least one character in the string. 
12076 [clinic start generated code]*/ 12077 12078 static PyObject * 12079 unicode_isnumeric_impl(PyObject *self) 12080 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/ 12081 { 12082 Py_ssize_t i, length; 12083 int kind; 12084 void *data; 12085 12086 if (PyUnicode_READY(self) == -1) 12087 return NULL; 12088 length = PyUnicode_GET_LENGTH(self); 12089 kind = PyUnicode_KIND(self); 12090 data = PyUnicode_DATA(self); 12091 12092 /* Shortcut for single character strings */ 12093 if (length == 1) 12094 return PyBool_FromLong( 12095 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 12096 12097 /* Special case for empty strings */ 12098 if (length == 0) 12099 Py_RETURN_FALSE; 12100 12101 for (i = 0; i < length; i++) { 12102 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 12103 Py_RETURN_FALSE; 12104 } 12105 Py_RETURN_TRUE; 12106 } 12107 12108 int 12109 PyUnicode_IsIdentifier(PyObject *self) 12110 { 12111 int kind; 12112 void *data; 12113 Py_ssize_t i; 12114 Py_UCS4 first; 12115 12116 if (PyUnicode_READY(self) == -1) { 12117 Py_FatalError("identifier not ready"); 12118 return 0; 12119 } 12120 12121 /* Special case for empty strings */ 12122 if (PyUnicode_GET_LENGTH(self) == 0) 12123 return 0; 12124 kind = PyUnicode_KIND(self); 12125 data = PyUnicode_DATA(self); 12126 12127 /* PEP 3131 says that the first character must be in 12128 XID_Start and subsequent characters in XID_Continue, 12129 and for the ASCII range, the 2.x rules apply (i.e 12130 start with letters and underscore, continue with 12131 letters, digits, underscore). However, given the current 12132 definition of XID_Start and XID_Continue, it is sufficient 12133 to check just for these, except that _ must be allowed 12134 as starting an identifier. 
*/ 12135 first = PyUnicode_READ(kind, data, 0); 12136 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 12137 return 0; 12138 12139 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 12140 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 12141 return 0; 12142 return 1; 12143 } 12144 12145 /*[clinic input] 12146 str.isidentifier as unicode_isidentifier 12147 12148 Return True if the string is a valid Python identifier, False otherwise. 12149 12150 Use keyword.iskeyword() to test for reserved identifiers such as "def" and 12151 "class". 12152 [clinic start generated code]*/ 12153 12154 static PyObject * 12155 unicode_isidentifier_impl(PyObject *self) 12156 /*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/ 12157 { 12158 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 12159 } 12160 12161 /*[clinic input] 12162 str.isprintable as unicode_isprintable 12163 12164 Return True if the string is printable, False otherwise. 12165 12166 A string is printable if all of its characters are considered printable in 12167 repr() or if it is empty. 
12168 [clinic start generated code]*/ 12169 12170 static PyObject * 12171 unicode_isprintable_impl(PyObject *self) 12172 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/ 12173 { 12174 Py_ssize_t i, length; 12175 int kind; 12176 void *data; 12177 12178 if (PyUnicode_READY(self) == -1) 12179 return NULL; 12180 length = PyUnicode_GET_LENGTH(self); 12181 kind = PyUnicode_KIND(self); 12182 data = PyUnicode_DATA(self); 12183 12184 /* Shortcut for single character strings */ 12185 if (length == 1) 12186 return PyBool_FromLong( 12187 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 12188 12189 for (i = 0; i < length; i++) { 12190 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 12191 Py_RETURN_FALSE; 12192 } 12193 } 12194 Py_RETURN_TRUE; 12195 } 12196 12197 /*[clinic input] 12198 str.join as unicode_join 12199 12200 iterable: object 12201 / 12202 12203 Concatenate any number of strings. 12204 12205 The string whose method is called is inserted in between each given string. 12206 The result is returned as a new string. 12207 12208 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs' 12209 [clinic start generated code]*/ 12210 12211 static PyObject * 12212 unicode_join(PyObject *self, PyObject *iterable) 12213 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/ 12214 { 12215 return PyUnicode_Join(self, iterable); 12216 } 12217 12218 static Py_ssize_t 12219 unicode_length(PyObject *self) 12220 { 12221 if (PyUnicode_READY(self) == -1) 12222 return -1; 12223 return PyUnicode_GET_LENGTH(self); 12224 } 12225 12226 /*[clinic input] 12227 str.ljust as unicode_ljust 12228 12229 width: Py_ssize_t 12230 fillchar: Py_UCS4 = ' ' 12231 / 12232 12233 Return a left-justified string of length width. 12234 12235 Padding is done using the specified fill character (default is a space). 
12236 [clinic start generated code]*/ 12237 12238 static PyObject * 12239 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar) 12240 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/ 12241 { 12242 if (PyUnicode_READY(self) == -1) 12243 return NULL; 12244 12245 if (PyUnicode_GET_LENGTH(self) >= width) 12246 return unicode_result_unchanged(self); 12247 12248 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar); 12249 } 12250 12251 /*[clinic input] 12252 str.lower as unicode_lower 12253 12254 Return a copy of the string converted to lowercase. 12255 [clinic start generated code]*/ 12256 12257 static PyObject * 12258 unicode_lower_impl(PyObject *self) 12259 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/ 12260 { 12261 if (PyUnicode_READY(self) == -1) 12262 return NULL; 12263 if (PyUnicode_IS_ASCII(self)) 12264 return ascii_upper_or_lower(self, 1); 12265 return case_operation(self, do_lower); 12266 } 12267 12268 #define LEFTSTRIP 0 12269 #define RIGHTSTRIP 1 12270 #define BOTHSTRIP 2 12271 12272 /* Arrays indexed by above */ 12273 static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"}; 12274 12275 #define STRIPNAME(i) (stripfuncnames[i]) 12276 12277 /* externally visible for str.strip(unicode) */ 12278 PyObject * 12279 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 12280 { 12281 void *data; 12282 int kind; 12283 Py_ssize_t i, j, len; 12284 BLOOM_MASK sepmask; 12285 Py_ssize_t seplen; 12286 12287 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 12288 return NULL; 12289 12290 kind = PyUnicode_KIND(self); 12291 data = PyUnicode_DATA(self); 12292 len = PyUnicode_GET_LENGTH(self); 12293 seplen = PyUnicode_GET_LENGTH(sepobj); 12294 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 12295 PyUnicode_DATA(sepobj), 12296 seplen); 12297 12298 i = 0; 12299 if (striptype != RIGHTSTRIP) { 12300 while (i < len) { 12301 Py_UCS4 ch = 
PyUnicode_READ(kind, data, i); 12302 if (!BLOOM(sepmask, ch)) 12303 break; 12304 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 12305 break; 12306 i++; 12307 } 12308 } 12309 12310 j = len; 12311 if (striptype != LEFTSTRIP) { 12312 j--; 12313 while (j >= i) { 12314 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12315 if (!BLOOM(sepmask, ch)) 12316 break; 12317 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 12318 break; 12319 j--; 12320 } 12321 12322 j++; 12323 } 12324 12325 return PyUnicode_Substring(self, i, j); 12326 } 12327 12328 PyObject* 12329 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 12330 { 12331 unsigned char *data; 12332 int kind; 12333 Py_ssize_t length; 12334 12335 if (PyUnicode_READY(self) == -1) 12336 return NULL; 12337 12338 length = PyUnicode_GET_LENGTH(self); 12339 end = Py_MIN(end, length); 12340 12341 if (start == 0 && end == length) 12342 return unicode_result_unchanged(self); 12343 12344 if (start < 0 || end < 0) { 12345 PyErr_SetString(PyExc_IndexError, "string index out of range"); 12346 return NULL; 12347 } 12348 if (start >= length || end < start) 12349 _Py_RETURN_UNICODE_EMPTY(); 12350 12351 length = end - start; 12352 if (PyUnicode_IS_ASCII(self)) { 12353 data = PyUnicode_1BYTE_DATA(self); 12354 return _PyUnicode_FromASCII((char*)(data + start), length); 12355 } 12356 else { 12357 kind = PyUnicode_KIND(self); 12358 data = PyUnicode_1BYTE_DATA(self); 12359 return PyUnicode_FromKindAndData(kind, 12360 data + kind * start, 12361 length); 12362 } 12363 } 12364 12365 static PyObject * 12366 do_strip(PyObject *self, int striptype) 12367 { 12368 Py_ssize_t len, i, j; 12369 12370 if (PyUnicode_READY(self) == -1) 12371 return NULL; 12372 12373 len = PyUnicode_GET_LENGTH(self); 12374 12375 if (PyUnicode_IS_ASCII(self)) { 12376 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self); 12377 12378 i = 0; 12379 if (striptype != RIGHTSTRIP) { 12380 while (i < len) { 12381 Py_UCS1 ch = data[i]; 12382 if (!_Py_ascii_whitespace[ch]) 
12383 break; 12384 i++; 12385 } 12386 } 12387 12388 j = len; 12389 if (striptype != LEFTSTRIP) { 12390 j--; 12391 while (j >= i) { 12392 Py_UCS1 ch = data[j]; 12393 if (!_Py_ascii_whitespace[ch]) 12394 break; 12395 j--; 12396 } 12397 j++; 12398 } 12399 } 12400 else { 12401 int kind = PyUnicode_KIND(self); 12402 void *data = PyUnicode_DATA(self); 12403 12404 i = 0; 12405 if (striptype != RIGHTSTRIP) { 12406 while (i < len) { 12407 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12408 if (!Py_UNICODE_ISSPACE(ch)) 12409 break; 12410 i++; 12411 } 12412 } 12413 12414 j = len; 12415 if (striptype != LEFTSTRIP) { 12416 j--; 12417 while (j >= i) { 12418 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12419 if (!Py_UNICODE_ISSPACE(ch)) 12420 break; 12421 j--; 12422 } 12423 j++; 12424 } 12425 } 12426 12427 return PyUnicode_Substring(self, i, j); 12428 } 12429 12430 12431 static PyObject * 12432 do_argstrip(PyObject *self, int striptype, PyObject *sep) 12433 { 12434 if (sep != NULL && sep != Py_None) { 12435 if (PyUnicode_Check(sep)) 12436 return _PyUnicode_XStrip(self, striptype, sep); 12437 else { 12438 PyErr_Format(PyExc_TypeError, 12439 "%s arg must be None or str", 12440 STRIPNAME(striptype)); 12441 return NULL; 12442 } 12443 } 12444 12445 return do_strip(self, striptype); 12446 } 12447 12448 12449 /*[clinic input] 12450 str.strip as unicode_strip 12451 12452 chars: object = None 12453 / 12454 12455 Return a copy of the string with leading and trailing whitespace remove. 12456 12457 If chars is given and not None, remove characters in chars instead. 
12458 [clinic start generated code]*/ 12459 12460 static PyObject * 12461 unicode_strip_impl(PyObject *self, PyObject *chars) 12462 /*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/ 12463 { 12464 return do_argstrip(self, BOTHSTRIP, chars); 12465 } 12466 12467 12468 /*[clinic input] 12469 str.lstrip as unicode_lstrip 12470 12471 chars: object = NULL 12472 / 12473 12474 Return a copy of the string with leading whitespace removed. 12475 12476 If chars is given and not None, remove characters in chars instead. 12477 [clinic start generated code]*/ 12478 12479 static PyObject * 12480 unicode_lstrip_impl(PyObject *self, PyObject *chars) 12481 /*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/ 12482 { 12483 return do_argstrip(self, LEFTSTRIP, chars); 12484 } 12485 12486 12487 /*[clinic input] 12488 str.rstrip as unicode_rstrip 12489 12490 chars: object = NULL 12491 / 12492 12493 Return a copy of the string with trailing whitespace removed. 12494 12495 If chars is given and not None, remove characters in chars instead. 
12496 [clinic start generated code]*/ 12497 12498 static PyObject * 12499 unicode_rstrip_impl(PyObject *self, PyObject *chars) 12500 /*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/ 12501 { 12502 return do_argstrip(self, RIGHTSTRIP, chars); 12503 } 12504 12505 12506 static PyObject* 12507 unicode_repeat(PyObject *str, Py_ssize_t len) 12508 { 12509 PyObject *u; 12510 Py_ssize_t nchars, n; 12511 12512 if (len < 1) 12513 _Py_RETURN_UNICODE_EMPTY(); 12514 12515 /* no repeat, return original string */ 12516 if (len == 1) 12517 return unicode_result_unchanged(str); 12518 12519 if (PyUnicode_READY(str) == -1) 12520 return NULL; 12521 12522 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 12523 PyErr_SetString(PyExc_OverflowError, 12524 "repeated string is too long"); 12525 return NULL; 12526 } 12527 nchars = len * PyUnicode_GET_LENGTH(str); 12528 12529 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 12530 if (!u) 12531 return NULL; 12532 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 12533 12534 if (PyUnicode_GET_LENGTH(str) == 1) { 12535 const int kind = PyUnicode_KIND(str); 12536 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 12537 if (kind == PyUnicode_1BYTE_KIND) { 12538 void *to = PyUnicode_DATA(u); 12539 memset(to, (unsigned char)fill_char, len); 12540 } 12541 else if (kind == PyUnicode_2BYTE_KIND) { 12542 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u); 12543 for (n = 0; n < len; ++n) 12544 ucs2[n] = fill_char; 12545 } else { 12546 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u); 12547 assert(kind == PyUnicode_4BYTE_KIND); 12548 for (n = 0; n < len; ++n) 12549 ucs4[n] = fill_char; 12550 } 12551 } 12552 else { 12553 /* number of characters copied this far */ 12554 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 12555 const Py_ssize_t char_size = PyUnicode_KIND(str); 12556 char *to = (char *) PyUnicode_DATA(u); 12557 memcpy(to, PyUnicode_DATA(str), 12558 PyUnicode_GET_LENGTH(str) * char_size); 12559 while (done 
< nchars) { 12560 n = (done <= nchars-done) ? done : nchars-done; 12561 memcpy(to + (done * char_size), to, n * char_size); 12562 done += n; 12563 } 12564 } 12565 12566 assert(_PyUnicode_CheckConsistency(u, 1)); 12567 return u; 12568 } 12569 12570 PyObject * 12571 PyUnicode_Replace(PyObject *str, 12572 PyObject *substr, 12573 PyObject *replstr, 12574 Py_ssize_t maxcount) 12575 { 12576 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 || 12577 ensure_unicode(replstr) < 0) 12578 return NULL; 12579 return replace(str, substr, replstr, maxcount); 12580 } 12581 12582 /*[clinic input] 12583 str.replace as unicode_replace 12584 12585 old: unicode 12586 new: unicode 12587 count: Py_ssize_t = -1 12588 Maximum number of occurrences to replace. 12589 -1 (the default value) means replace all occurrences. 12590 / 12591 12592 Return a copy with all occurrences of substring old replaced by new. 12593 12594 If the optional argument count is given, only the first count occurrences are 12595 replaced. 
[clinic start generated code]*/

static PyObject *
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
                     Py_ssize_t count)
/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
{
    /* Only self needs readying here; the work is done by replace(). */
    if (PyUnicode_READY(self) == -1)
        return NULL;
    return replace(self, old, new, count);
}

/* Build the repr() of a string: the text wrapped in quotes, with
   backslash escapes for quotes/backslashes, tab/CR/LF, and every
   character that is not printable.

   Pass 1 sizes the output, counts both kinds of quote and tracks the
   largest character that will be copied unescaped; pass 2 writes the
   characters.  The escape widths used in pass 1 must match exactly what
   pass 2 writes.  Returns a new reference, or NULL with OverflowError
   set if the repr would exceed PY_SSIZE_T_MAX characters. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    PyObject *repr;
    Py_ssize_t isize;
    Py_ssize_t osize, squote, dquote, i, o;
    Py_UCS4 max, quote;
    int ikind, okind, unchanged;
    void *idata, *odata;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;

    isize = PyUnicode_GET_LENGTH(unicode);
    idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    osize = 0;
    max = 127;
    squote = dquote = 0;
    ikind = PyUnicode_KIND(unicode);
    for (i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t incr = 1;
        switch (ch) {
        /* Quotes are only counted here; whether they need escaping is
           decided once the whole string has been scanned. */
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                /* Copied as-is, so it may widen the output string. */
                max = ch > max ? ch : max;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \UHHHHHHHH */
        }
        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    /* Choose the quote character: single quotes by default, double quotes
       when the text contains single quotes but no double quotes. */
    quote = '\'';
    /* No character needed an escape so far (quotes are settled below). */
    unchanged = (osize == isize);
    if (squote) {
        unchanged = 0;
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            osize += squote;
        else
            quote = '"';
    }
    osize += 2;   /* quotes */

    repr = PyUnicode_New(osize, max);
    if (repr == NULL)
        return NULL;
    okind = PyUnicode_KIND(repr);
    odata = PyUnicode_DATA(repr);

    /* Both the opening and the closing quote are written up front. */
    PyUnicode_WRITE(okind, odata, 0, quote);
    PyUnicode_WRITE(okind, odata, osize-1, quote);
    if (unchanged) {
        /* Nothing to escape: copy the text between the quotes. */
        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);
    }
    else {
        /* i indexes the input, o the output (slot 0 holds the quote). */
        for (i = 0, o = 1; i < isize; i++) {
            Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);

            /* Escape quotes and backslashes */
            if ((ch == quote) || (ch == '\\')) {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, ch);
                continue;
            }

            /* Map special whitespace to '\t', '\n', '\r' */
            if (ch == '\t') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 't');
            }
            else if (ch == '\n') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'n');
            }
            else if (ch == '\r') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'r');
            }

            /* Map non-printable US ASCII to '\xhh' */
            else if (ch < ' ' || ch == 0x7F) {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'x');
                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
            }

            /* Copy ASCII characters as-is */
            else if (ch < 0x7F) {
                PyUnicode_WRITE(okind, odata, o++, ch);
            }

            /* Non-ASCII characters */
            else {
                /* Map Unicode whitespace and control characters
                   (categories Z* and C* except ASCII space)
                */
                if (!Py_UNICODE_ISPRINTABLE(ch)) {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    /* Map 8-bit characters to '\xhh' */
                    if (ch <= 0xff) {
                        PyUnicode_WRITE(okind, odata, o++, 'x');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
                    }
                    /* Map 16-bit characters to '\uxxxx' */
                    else if (ch <= 0xffff) {
                        PyUnicode_WRITE(okind, odata, o++, 'u');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
                    }
                    /* Map 21-bit characters to '\U00xxxxxx' */
                    else {
                        PyUnicode_WRITE(okind, odata, o++, 'U');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
                    }
                }
                /* Copy characters as-is */
                else {
                    PyUnicode_WRITE(okind, odata, o++, ch);
                }
            }
        }
    }
    /* Closing quote already added at the beginning */
    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}

PyDoc_STRVAR(rfind__doc__,
             "S.rfind(sub[, start[, end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.");

/* str.rfind(): parse (sub[, start[, end]]) and search with direction -1.
   A search result of -2 is turned into a NULL return. */
static PyObject *
unicode_rfind(PyObject *self, PyObject *args)
{
    /* initialize variables to prevent gcc warning */
    PyObject *substring = NULL;
    Py_ssize_t start = 0;
    Py_ssize_t end = 0;
    Py_ssize_t result;

    if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
        return NULL;

    if (PyUnicode_READY(self) == -1)
        return NULL;

    result = any_find_slice(self, substring, start, end, -1);

    if (result == -2)
        return NULL;

    return PyLong_FromSsize_t(result);
}

PyDoc_STRVAR(rindex__doc__,
             "S.rindex(sub[, start[, end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Raises ValueError when the substring is not found.");

/* str.rindex(): same search as unicode_rfind(), but a negative result
   (other than the -2 error code) raises ValueError. */
static PyObject *
unicode_rindex(PyObject *self, PyObject *args)
{
    /* initialize variables to prevent gcc warning */
    PyObject *substring = NULL;
    Py_ssize_t start = 0;
    Py_ssize_t end = 0;
    Py_ssize_t result;

    if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
        return NULL;

    if (PyUnicode_READY(self) == -1)
        return NULL;

    result = any_find_slice(self, substring, start, end, -1);

    if (result == -2)
        return NULL;

    if (result < 0) {
        PyErr_SetString(PyExc_ValueError, "substring not found");
        return NULL;
    }

    return PyLong_FromSsize_t(result);
}

/*[clinic input]
str.rjust as unicode_rjust

    width: Py_ssize_t
    fillchar: Py_UCS4 = ' '
    /

Return a right-justified string of
length width. 12849 12850 Padding is done using the specified fill character (default is a space). 12851 [clinic start generated code]*/ 12852 12853 static PyObject * 12854 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar) 12855 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/ 12856 { 12857 if (PyUnicode_READY(self) == -1) 12858 return NULL; 12859 12860 if (PyUnicode_GET_LENGTH(self) >= width) 12861 return unicode_result_unchanged(self); 12862 12863 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar); 12864 } 12865 12866 PyObject * 12867 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12868 { 12869 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) 12870 return NULL; 12871 12872 return split(s, sep, maxsplit); 12873 } 12874 12875 /*[clinic input] 12876 str.split as unicode_split 12877 12878 sep: object = None 12879 The delimiter according which to split the string. 12880 None (the default value) means split according to any whitespace, 12881 and discard empty strings from the result. 12882 maxsplit: Py_ssize_t = -1 12883 Maximum number of splits to do. 12884 -1 (the default value) means no limit. 12885 12886 Return a list of the words in the string, using sep as the delimiter string. 
12887 [clinic start generated code]*/ 12888 12889 static PyObject * 12890 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit) 12891 /*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/ 12892 { 12893 if (sep == Py_None) 12894 return split(self, NULL, maxsplit); 12895 if (PyUnicode_Check(sep)) 12896 return split(self, sep, maxsplit); 12897 12898 PyErr_Format(PyExc_TypeError, 12899 "must be str or None, not %.100s", 12900 Py_TYPE(sep)->tp_name); 12901 return NULL; 12902 } 12903 12904 PyObject * 12905 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj) 12906 { 12907 PyObject* out; 12908 int kind1, kind2; 12909 void *buf1, *buf2; 12910 Py_ssize_t len1, len2; 12911 12912 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) 12913 return NULL; 12914 12915 kind1 = PyUnicode_KIND(str_obj); 12916 kind2 = PyUnicode_KIND(sep_obj); 12917 len1 = PyUnicode_GET_LENGTH(str_obj); 12918 len2 = PyUnicode_GET_LENGTH(sep_obj); 12919 if (kind1 < kind2 || len1 < len2) { 12920 _Py_INCREF_UNICODE_EMPTY(); 12921 if (!unicode_empty) 12922 out = NULL; 12923 else { 12924 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty); 12925 Py_DECREF(unicode_empty); 12926 } 12927 return out; 12928 } 12929 buf1 = PyUnicode_DATA(str_obj); 12930 buf2 = PyUnicode_DATA(sep_obj); 12931 if (kind2 != kind1) { 12932 buf2 = _PyUnicode_AsKind(sep_obj, kind1); 12933 if (!buf2) 12934 return NULL; 12935 } 12936 12937 switch (kind1) { 12938 case PyUnicode_1BYTE_KIND: 12939 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12940 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12941 else 12942 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12943 break; 12944 case PyUnicode_2BYTE_KIND: 12945 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12946 break; 12947 case PyUnicode_4BYTE_KIND: 12948 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12949 break; 12950 default: 
12951 Py_UNREACHABLE(); 12952 } 12953 12954 if (kind2 != kind1) 12955 PyMem_Free(buf2); 12956 12957 return out; 12958 } 12959 12960 12961 PyObject * 12962 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj) 12963 { 12964 PyObject* out; 12965 int kind1, kind2; 12966 void *buf1, *buf2; 12967 Py_ssize_t len1, len2; 12968 12969 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) 12970 return NULL; 12971 12972 kind1 = PyUnicode_KIND(str_obj); 12973 kind2 = PyUnicode_KIND(sep_obj); 12974 len1 = PyUnicode_GET_LENGTH(str_obj); 12975 len2 = PyUnicode_GET_LENGTH(sep_obj); 12976 if (kind1 < kind2 || len1 < len2) { 12977 _Py_INCREF_UNICODE_EMPTY(); 12978 if (!unicode_empty) 12979 out = NULL; 12980 else { 12981 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj); 12982 Py_DECREF(unicode_empty); 12983 } 12984 return out; 12985 } 12986 buf1 = PyUnicode_DATA(str_obj); 12987 buf2 = PyUnicode_DATA(sep_obj); 12988 if (kind2 != kind1) { 12989 buf2 = _PyUnicode_AsKind(sep_obj, kind1); 12990 if (!buf2) 12991 return NULL; 12992 } 12993 12994 switch (kind1) { 12995 case PyUnicode_1BYTE_KIND: 12996 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12997 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12998 else 12999 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 13000 break; 13001 case PyUnicode_2BYTE_KIND: 13002 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 13003 break; 13004 case PyUnicode_4BYTE_KIND: 13005 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 13006 break; 13007 default: 13008 Py_UNREACHABLE(); 13009 } 13010 13011 if (kind2 != kind1) 13012 PyMem_Free(buf2); 13013 13014 return out; 13015 } 13016 13017 /*[clinic input] 13018 str.partition as unicode_partition 13019 13020 sep: object 13021 / 13022 13023 Partition the string into three parts using the given separator. 13024 13025 This will search for the separator in the string. 
If the separator is found, 13026 returns a 3-tuple containing the part before the separator, the separator 13027 itself, and the part after it. 13028 13029 If the separator is not found, returns a 3-tuple containing the original string 13030 and two empty strings. 13031 [clinic start generated code]*/ 13032 13033 static PyObject * 13034 unicode_partition(PyObject *self, PyObject *sep) 13035 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/ 13036 { 13037 return PyUnicode_Partition(self, sep); 13038 } 13039 13040 /*[clinic input] 13041 str.rpartition as unicode_rpartition = str.partition 13042 13043 Partition the string into three parts using the given separator. 13044 13045 This will search for the separator in the string, starting at the end. If 13046 the separator is found, returns a 3-tuple containing the part before the 13047 separator, the separator itself, and the part after it. 13048 13049 If the separator is not found, returns a 3-tuple containing two empty strings 13050 and the original string. 13051 [clinic start generated code]*/ 13052 13053 static PyObject * 13054 unicode_rpartition(PyObject *self, PyObject *sep) 13055 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/ 13056 { 13057 return PyUnicode_RPartition(self, sep); 13058 } 13059 13060 PyObject * 13061 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 13062 { 13063 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) 13064 return NULL; 13065 13066 return rsplit(s, sep, maxsplit); 13067 } 13068 13069 /*[clinic input] 13070 str.rsplit as unicode_rsplit = str.split 13071 13072 Return a list of the words in the string, using sep as the delimiter string. 13073 13074 Splits are done starting at the end of the string and working to the front. 
13075 [clinic start generated code]*/ 13076 13077 static PyObject * 13078 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit) 13079 /*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/ 13080 { 13081 if (sep == Py_None) 13082 return rsplit(self, NULL, maxsplit); 13083 if (PyUnicode_Check(sep)) 13084 return rsplit(self, sep, maxsplit); 13085 13086 PyErr_Format(PyExc_TypeError, 13087 "must be str or None, not %.100s", 13088 Py_TYPE(sep)->tp_name); 13089 return NULL; 13090 } 13091 13092 /*[clinic input] 13093 str.splitlines as unicode_splitlines 13094 13095 keepends: bool(accept={int}) = False 13096 13097 Return a list of the lines in the string, breaking at line boundaries. 13098 13099 Line breaks are not included in the resulting list unless keepends is given and 13100 true. 13101 [clinic start generated code]*/ 13102 13103 static PyObject * 13104 unicode_splitlines_impl(PyObject *self, int keepends) 13105 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/ 13106 { 13107 return PyUnicode_Splitlines(self, keepends); 13108 } 13109 13110 static 13111 PyObject *unicode_str(PyObject *self) 13112 { 13113 return unicode_result_unchanged(self); 13114 } 13115 13116 /*[clinic input] 13117 str.swapcase as unicode_swapcase 13118 13119 Convert uppercase characters to lowercase and lowercase characters to uppercase. 13120 [clinic start generated code]*/ 13121 13122 static PyObject * 13123 unicode_swapcase_impl(PyObject *self) 13124 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/ 13125 { 13126 if (PyUnicode_READY(self) == -1) 13127 return NULL; 13128 return case_operation(self, do_swapcase); 13129 } 13130 13131 /*[clinic input] 13132 13133 @staticmethod 13134 str.maketrans as unicode_maketrans 13135 13136 x: object 13137 13138 y: unicode=NULL 13139 13140 z: unicode=NULL 13141 13142 / 13143 13144 Return a translation table usable for str.translate(). 
13145 13146 If there is only one argument, it must be a dictionary mapping Unicode 13147 ordinals (integers) or characters to Unicode ordinals, strings or None. 13148 Character keys will be then converted to ordinals. 13149 If there are two arguments, they must be strings of equal length, and 13150 in the resulting dictionary, each character in x will be mapped to the 13151 character at the same position in y. If there is a third argument, it 13152 must be a string, whose characters will be mapped to None in the result. 13153 [clinic start generated code]*/ 13154 13155 static PyObject * 13156 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z) 13157 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/ 13158 { 13159 PyObject *new = NULL, *key, *value; 13160 Py_ssize_t i = 0; 13161 int res; 13162 13163 new = PyDict_New(); 13164 if (!new) 13165 return NULL; 13166 if (y != NULL) { 13167 int x_kind, y_kind, z_kind; 13168 void *x_data, *y_data, *z_data; 13169 13170 /* x must be a string too, of equal length */ 13171 if (!PyUnicode_Check(x)) { 13172 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 13173 "be a string if there is a second argument"); 13174 goto err; 13175 } 13176 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 13177 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 13178 "arguments must have equal length"); 13179 goto err; 13180 } 13181 /* create entries for translating chars in x to those in y */ 13182 x_kind = PyUnicode_KIND(x); 13183 y_kind = PyUnicode_KIND(y); 13184 x_data = PyUnicode_DATA(x); 13185 y_data = PyUnicode_DATA(y); 13186 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 13187 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 13188 if (!key) 13189 goto err; 13190 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 13191 if (!value) { 13192 Py_DECREF(key); 13193 goto err; 13194 } 13195 res = PyDict_SetItem(new, key, value); 13196 Py_DECREF(key); 13197 
Py_DECREF(value); 13198 if (res < 0) 13199 goto err; 13200 } 13201 /* create entries for deleting chars in z */ 13202 if (z != NULL) { 13203 z_kind = PyUnicode_KIND(z); 13204 z_data = PyUnicode_DATA(z); 13205 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 13206 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 13207 if (!key) 13208 goto err; 13209 res = PyDict_SetItem(new, key, Py_None); 13210 Py_DECREF(key); 13211 if (res < 0) 13212 goto err; 13213 } 13214 } 13215 } else { 13216 int kind; 13217 void *data; 13218 13219 /* x must be a dict */ 13220 if (!PyDict_CheckExact(x)) { 13221 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 13222 "to maketrans it must be a dict"); 13223 goto err; 13224 } 13225 /* copy entries into the new dict, converting string keys to int keys */ 13226 while (PyDict_Next(x, &i, &key, &value)) { 13227 if (PyUnicode_Check(key)) { 13228 /* convert string keys to integer keys */ 13229 PyObject *newkey; 13230 if (PyUnicode_GET_LENGTH(key) != 1) { 13231 PyErr_SetString(PyExc_ValueError, "string keys in translate " 13232 "table must be of length 1"); 13233 goto err; 13234 } 13235 kind = PyUnicode_KIND(key); 13236 data = PyUnicode_DATA(key); 13237 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 13238 if (!newkey) 13239 goto err; 13240 res = PyDict_SetItem(new, newkey, value); 13241 Py_DECREF(newkey); 13242 if (res < 0) 13243 goto err; 13244 } else if (PyLong_Check(key)) { 13245 /* just keep integer keys */ 13246 if (PyDict_SetItem(new, key, value) < 0) 13247 goto err; 13248 } else { 13249 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 13250 "be strings or integers"); 13251 goto err; 13252 } 13253 } 13254 } 13255 return new; 13256 err: 13257 Py_DECREF(new); 13258 return NULL; 13259 } 13260 13261 /*[clinic input] 13262 str.translate as unicode_translate 13263 13264 table: object 13265 Translation table, which must be a mapping of Unicode ordinals to 13266 Unicode ordinals, strings, or None. 
13267 / 13268 13269 Replace each character in the string using the given translation table. 13270 13271 The table must implement lookup/indexing via __getitem__, for instance a 13272 dictionary or list. If this operation raises LookupError, the character is 13273 left untouched. Characters mapped to None are deleted. 13274 [clinic start generated code]*/ 13275 13276 static PyObject * 13277 unicode_translate(PyObject *self, PyObject *table) 13278 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/ 13279 { 13280 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 13281 } 13282 13283 /*[clinic input] 13284 str.upper as unicode_upper 13285 13286 Return a copy of the string converted to uppercase. 13287 [clinic start generated code]*/ 13288 13289 static PyObject * 13290 unicode_upper_impl(PyObject *self) 13291 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/ 13292 { 13293 if (PyUnicode_READY(self) == -1) 13294 return NULL; 13295 if (PyUnicode_IS_ASCII(self)) 13296 return ascii_upper_or_lower(self, 0); 13297 return case_operation(self, do_upper); 13298 } 13299 13300 /*[clinic input] 13301 str.zfill as unicode_zfill 13302 13303 width: Py_ssize_t 13304 / 13305 13306 Pad a numeric string with zeros on the left, to fill a field of the given width. 13307 13308 The string is never truncated. 
13309 [clinic start generated code]*/ 13310 13311 static PyObject * 13312 unicode_zfill_impl(PyObject *self, Py_ssize_t width) 13313 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/ 13314 { 13315 Py_ssize_t fill; 13316 PyObject *u; 13317 int kind; 13318 void *data; 13319 Py_UCS4 chr; 13320 13321 if (PyUnicode_READY(self) == -1) 13322 return NULL; 13323 13324 if (PyUnicode_GET_LENGTH(self) >= width) 13325 return unicode_result_unchanged(self); 13326 13327 fill = width - PyUnicode_GET_LENGTH(self); 13328 13329 u = pad(self, fill, 0, '0'); 13330 13331 if (u == NULL) 13332 return NULL; 13333 13334 kind = PyUnicode_KIND(u); 13335 data = PyUnicode_DATA(u); 13336 chr = PyUnicode_READ(kind, data, fill); 13337 13338 if (chr == '+' || chr == '-') { 13339 /* move sign to beginning of string */ 13340 PyUnicode_WRITE(kind, data, 0, chr); 13341 PyUnicode_WRITE(kind, data, fill, '0'); 13342 } 13343 13344 assert(_PyUnicode_CheckConsistency(u, 1)); 13345 return u; 13346 } 13347 13348 #if 0 13349 static PyObject * 13350 unicode__decimal2ascii(PyObject *self) 13351 { 13352 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 13353 } 13354 #endif 13355 13356 PyDoc_STRVAR(startswith__doc__, 13357 "S.startswith(prefix[, start[, end]]) -> bool\n\ 13358 \n\ 13359 Return True if S starts with the specified prefix, False otherwise.\n\ 13360 With optional start, test S beginning at that position.\n\ 13361 With optional end, stop comparing S at that position.\n\ 13362 prefix can also be a tuple of strings to try."); 13363 13364 static PyObject * 13365 unicode_startswith(PyObject *self, 13366 PyObject *args) 13367 { 13368 PyObject *subobj; 13369 PyObject *substring; 13370 Py_ssize_t start = 0; 13371 Py_ssize_t end = PY_SSIZE_T_MAX; 13372 int result; 13373 13374 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 13375 return NULL; 13376 if (PyTuple_Check(subobj)) { 13377 Py_ssize_t i; 13378 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) 
{ 13379 substring = PyTuple_GET_ITEM(subobj, i); 13380 if (!PyUnicode_Check(substring)) { 13381 PyErr_Format(PyExc_TypeError, 13382 "tuple for startswith must only contain str, " 13383 "not %.100s", 13384 Py_TYPE(substring)->tp_name); 13385 return NULL; 13386 } 13387 result = tailmatch(self, substring, start, end, -1); 13388 if (result == -1) 13389 return NULL; 13390 if (result) { 13391 Py_RETURN_TRUE; 13392 } 13393 } 13394 /* nothing matched */ 13395 Py_RETURN_FALSE; 13396 } 13397 if (!PyUnicode_Check(subobj)) { 13398 PyErr_Format(PyExc_TypeError, 13399 "startswith first arg must be str or " 13400 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name); 13401 return NULL; 13402 } 13403 result = tailmatch(self, subobj, start, end, -1); 13404 if (result == -1) 13405 return NULL; 13406 return PyBool_FromLong(result); 13407 } 13408 13409 13410 PyDoc_STRVAR(endswith__doc__, 13411 "S.endswith(suffix[, start[, end]]) -> bool\n\ 13412 \n\ 13413 Return True if S ends with the specified suffix, False otherwise.\n\ 13414 With optional start, test S beginning at that position.\n\ 13415 With optional end, stop comparing S at that position.\n\ 13416 suffix can also be a tuple of strings to try."); 13417 13418 static PyObject * 13419 unicode_endswith(PyObject *self, 13420 PyObject *args) 13421 { 13422 PyObject *subobj; 13423 PyObject *substring; 13424 Py_ssize_t start = 0; 13425 Py_ssize_t end = PY_SSIZE_T_MAX; 13426 int result; 13427 13428 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 13429 return NULL; 13430 if (PyTuple_Check(subobj)) { 13431 Py_ssize_t i; 13432 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13433 substring = PyTuple_GET_ITEM(subobj, i); 13434 if (!PyUnicode_Check(substring)) { 13435 PyErr_Format(PyExc_TypeError, 13436 "tuple for endswith must only contain str, " 13437 "not %.100s", 13438 Py_TYPE(substring)->tp_name); 13439 return NULL; 13440 } 13441 result = tailmatch(self, substring, start, end, +1); 13442 if (result == -1) 
13443 return NULL; 13444 if (result) { 13445 Py_RETURN_TRUE; 13446 } 13447 } 13448 Py_RETURN_FALSE; 13449 } 13450 if (!PyUnicode_Check(subobj)) { 13451 PyErr_Format(PyExc_TypeError, 13452 "endswith first arg must be str or " 13453 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name); 13454 return NULL; 13455 } 13456 result = tailmatch(self, subobj, start, end, +1); 13457 if (result == -1) 13458 return NULL; 13459 return PyBool_FromLong(result); 13460 } 13461 13462 static inline void 13463 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer) 13464 { 13465 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); 13466 writer->data = PyUnicode_DATA(writer->buffer); 13467 13468 if (!writer->readonly) { 13469 writer->kind = PyUnicode_KIND(writer->buffer); 13470 writer->size = PyUnicode_GET_LENGTH(writer->buffer); 13471 } 13472 else { 13473 /* use a value smaller than PyUnicode_1BYTE_KIND() so 13474 _PyUnicodeWriter_PrepareKind() will copy the buffer. */ 13475 writer->kind = PyUnicode_WCHAR_KIND; 13476 assert(writer->kind <= PyUnicode_1BYTE_KIND); 13477 13478 /* Copy-on-write mode: set buffer size to 0 so 13479 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on 13480 * next write. */ 13481 writer->size = 0; 13482 } 13483 } 13484 13485 void 13486 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) 13487 { 13488 memset(writer, 0, sizeof(*writer)); 13489 13490 /* ASCII is the bare minimum */ 13491 writer->min_char = 127; 13492 13493 /* use a value smaller than PyUnicode_1BYTE_KIND() so 13494 _PyUnicodeWriter_PrepareKind() will copy the buffer. 
*/ 13495 writer->kind = PyUnicode_WCHAR_KIND; 13496 assert(writer->kind <= PyUnicode_1BYTE_KIND); 13497 } 13498 13499 int 13500 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 13501 Py_ssize_t length, Py_UCS4 maxchar) 13502 { 13503 Py_ssize_t newlen; 13504 PyObject *newbuffer; 13505 13506 assert(maxchar <= MAX_UNICODE); 13507 13508 /* ensure that the _PyUnicodeWriter_Prepare macro was used */ 13509 assert((maxchar > writer->maxchar && length >= 0) 13510 || length > 0); 13511 13512 if (length > PY_SSIZE_T_MAX - writer->pos) { 13513 PyErr_NoMemory(); 13514 return -1; 13515 } 13516 newlen = writer->pos + length; 13517 13518 maxchar = Py_MAX(maxchar, writer->min_char); 13519 13520 if (writer->buffer == NULL) { 13521 assert(!writer->readonly); 13522 if (writer->overallocate 13523 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13524 /* overallocate to limit the number of realloc() */ 13525 newlen += newlen / OVERALLOCATE_FACTOR; 13526 } 13527 if (newlen < writer->min_length) 13528 newlen = writer->min_length; 13529 13530 writer->buffer = PyUnicode_New(newlen, maxchar); 13531 if (writer->buffer == NULL) 13532 return -1; 13533 } 13534 else if (newlen > writer->size) { 13535 if (writer->overallocate 13536 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13537 /* overallocate to limit the number of realloc() */ 13538 newlen += newlen / OVERALLOCATE_FACTOR; 13539 } 13540 if (newlen < writer->min_length) 13541 newlen = writer->min_length; 13542 13543 if (maxchar > writer->maxchar || writer->readonly) { 13544 /* resize + widen */ 13545 maxchar = Py_MAX(maxchar, writer->maxchar); 13546 newbuffer = PyUnicode_New(newlen, maxchar); 13547 if (newbuffer == NULL) 13548 return -1; 13549 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13550 writer->buffer, 0, writer->pos); 13551 Py_DECREF(writer->buffer); 13552 writer->readonly = 0; 13553 } 13554 else { 13555 newbuffer = resize_compact(writer->buffer, newlen); 13556 if (newbuffer == NULL) 13557 
return -1; 13558 } 13559 writer->buffer = newbuffer; 13560 } 13561 else if (maxchar > writer->maxchar) { 13562 assert(!writer->readonly); 13563 newbuffer = PyUnicode_New(writer->size, maxchar); 13564 if (newbuffer == NULL) 13565 return -1; 13566 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13567 writer->buffer, 0, writer->pos); 13568 Py_SETREF(writer->buffer, newbuffer); 13569 } 13570 _PyUnicodeWriter_Update(writer); 13571 return 0; 13572 13573 #undef OVERALLOCATE_FACTOR 13574 } 13575 13576 int 13577 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, 13578 enum PyUnicode_Kind kind) 13579 { 13580 Py_UCS4 maxchar; 13581 13582 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */ 13583 assert(writer->kind < kind); 13584 13585 switch (kind) 13586 { 13587 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break; 13588 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break; 13589 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break; 13590 default: 13591 Py_UNREACHABLE(); 13592 } 13593 13594 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar); 13595 } 13596 13597 static inline int 13598 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) 13599 { 13600 assert(ch <= MAX_UNICODE); 13601 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0) 13602 return -1; 13603 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch); 13604 writer->pos++; 13605 return 0; 13606 } 13607 13608 int 13609 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch) 13610 { 13611 return _PyUnicodeWriter_WriteCharInline(writer, ch); 13612 } 13613 13614 int 13615 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) 13616 { 13617 Py_UCS4 maxchar; 13618 Py_ssize_t len; 13619 13620 if (PyUnicode_READY(str) == -1) 13621 return -1; 13622 len = PyUnicode_GET_LENGTH(str); 13623 if (len == 0) 13624 return 0; 13625 maxchar = PyUnicode_MAX_CHAR_VALUE(str); 13626 if (maxchar > writer->maxchar || len > writer->size - writer->pos) { 13627 if (writer->buffer 
== NULL && !writer->overallocate) { 13628 assert(_PyUnicode_CheckConsistency(str, 1)); 13629 writer->readonly = 1; 13630 Py_INCREF(str); 13631 writer->buffer = str; 13632 _PyUnicodeWriter_Update(writer); 13633 writer->pos += len; 13634 return 0; 13635 } 13636 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) 13637 return -1; 13638 } 13639 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13640 str, 0, len); 13641 writer->pos += len; 13642 return 0; 13643 } 13644 13645 int 13646 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str, 13647 Py_ssize_t start, Py_ssize_t end) 13648 { 13649 Py_UCS4 maxchar; 13650 Py_ssize_t len; 13651 13652 if (PyUnicode_READY(str) == -1) 13653 return -1; 13654 13655 assert(0 <= start); 13656 assert(end <= PyUnicode_GET_LENGTH(str)); 13657 assert(start <= end); 13658 13659 if (end == 0) 13660 return 0; 13661 13662 if (start == 0 && end == PyUnicode_GET_LENGTH(str)) 13663 return _PyUnicodeWriter_WriteStr(writer, str); 13664 13665 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 13666 maxchar = _PyUnicode_FindMaxChar(str, start, end); 13667 else 13668 maxchar = writer->maxchar; 13669 len = end - start; 13670 13671 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) 13672 return -1; 13673 13674 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13675 str, start, len); 13676 writer->pos += len; 13677 return 0; 13678 } 13679 13680 int 13681 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, 13682 const char *ascii, Py_ssize_t len) 13683 { 13684 if (len == -1) 13685 len = strlen(ascii); 13686 13687 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128); 13688 13689 if (writer->buffer == NULL && !writer->overallocate) { 13690 PyObject *str; 13691 13692 str = _PyUnicode_FromASCII(ascii, len); 13693 if (str == NULL) 13694 return -1; 13695 13696 writer->readonly = 1; 13697 writer->buffer = str; 13698 _PyUnicodeWriter_Update(writer); 13699 writer->pos += len; 
13700 return 0; 13701 } 13702 13703 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) 13704 return -1; 13705 13706 switch (writer->kind) 13707 { 13708 case PyUnicode_1BYTE_KIND: 13709 { 13710 const Py_UCS1 *str = (const Py_UCS1 *)ascii; 13711 Py_UCS1 *data = writer->data; 13712 13713 memcpy(data + writer->pos, str, len); 13714 break; 13715 } 13716 case PyUnicode_2BYTE_KIND: 13717 { 13718 _PyUnicode_CONVERT_BYTES( 13719 Py_UCS1, Py_UCS2, 13720 ascii, ascii + len, 13721 (Py_UCS2 *)writer->data + writer->pos); 13722 break; 13723 } 13724 case PyUnicode_4BYTE_KIND: 13725 { 13726 _PyUnicode_CONVERT_BYTES( 13727 Py_UCS1, Py_UCS4, 13728 ascii, ascii + len, 13729 (Py_UCS4 *)writer->data + writer->pos); 13730 break; 13731 } 13732 default: 13733 Py_UNREACHABLE(); 13734 } 13735 13736 writer->pos += len; 13737 return 0; 13738 } 13739 13740 int 13741 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, 13742 const char *str, Py_ssize_t len) 13743 { 13744 Py_UCS4 maxchar; 13745 13746 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len); 13747 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) 13748 return -1; 13749 unicode_write_cstr(writer->buffer, writer->pos, str, len); 13750 writer->pos += len; 13751 return 0; 13752 } 13753 13754 PyObject * 13755 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) 13756 { 13757 PyObject *str; 13758 13759 if (writer->pos == 0) { 13760 Py_CLEAR(writer->buffer); 13761 _Py_RETURN_UNICODE_EMPTY(); 13762 } 13763 13764 str = writer->buffer; 13765 writer->buffer = NULL; 13766 13767 if (writer->readonly) { 13768 assert(PyUnicode_GET_LENGTH(str) == writer->pos); 13769 return str; 13770 } 13771 13772 if (PyUnicode_GET_LENGTH(str) != writer->pos) { 13773 PyObject *str2; 13774 str2 = resize_compact(str, writer->pos); 13775 if (str2 == NULL) { 13776 Py_DECREF(str); 13777 return NULL; 13778 } 13779 str = str2; 13780 } 13781 13782 assert(_PyUnicode_CheckConsistency(str, 1)); 13783 return unicode_result_ready(str); 13784 } 
13785 13786 void 13787 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) 13788 { 13789 Py_CLEAR(writer->buffer); 13790 } 13791 13792 #include "stringlib/unicode_format.h" 13793 13794 PyDoc_STRVAR(format__doc__, 13795 "S.format(*args, **kwargs) -> str\n\ 13796 \n\ 13797 Return a formatted version of S, using substitutions from args and kwargs.\n\ 13798 The substitutions are identified by braces ('{' and '}')."); 13799 13800 PyDoc_STRVAR(format_map__doc__, 13801 "S.format_map(mapping) -> str\n\ 13802 \n\ 13803 Return a formatted version of S, using substitutions from mapping.\n\ 13804 The substitutions are identified by braces ('{' and '}')."); 13805 13806 /*[clinic input] 13807 str.__format__ as unicode___format__ 13808 13809 format_spec: unicode 13810 / 13811 13812 Return a formatted version of the string as described by format_spec. 13813 [clinic start generated code]*/ 13814 13815 static PyObject * 13816 unicode___format___impl(PyObject *self, PyObject *format_spec) 13817 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/ 13818 { 13819 _PyUnicodeWriter writer; 13820 int ret; 13821 13822 if (PyUnicode_READY(self) == -1) 13823 return NULL; 13824 _PyUnicodeWriter_Init(&writer); 13825 ret = _PyUnicode_FormatAdvancedWriter(&writer, 13826 self, format_spec, 0, 13827 PyUnicode_GET_LENGTH(format_spec)); 13828 if (ret == -1) { 13829 _PyUnicodeWriter_Dealloc(&writer); 13830 return NULL; 13831 } 13832 return _PyUnicodeWriter_Finish(&writer); 13833 } 13834 13835 /*[clinic input] 13836 str.__sizeof__ as unicode_sizeof 13837 13838 Return the size of the string in memory, in bytes. 13839 [clinic start generated code]*/ 13840 13841 static PyObject * 13842 unicode_sizeof_impl(PyObject *self) 13843 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/ 13844 { 13845 Py_ssize_t size; 13846 13847 /* If it's a compact object, account for base structure + 13848 character data. 
*/ 13849 if (PyUnicode_IS_COMPACT_ASCII(self)) 13850 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1; 13851 else if (PyUnicode_IS_COMPACT(self)) 13852 size = sizeof(PyCompactUnicodeObject) + 13853 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self); 13854 else { 13855 /* If it is a two-block object, account for base object, and 13856 for character block if present. */ 13857 size = sizeof(PyUnicodeObject); 13858 if (_PyUnicode_DATA_ANY(self)) 13859 size += (PyUnicode_GET_LENGTH(self) + 1) * 13860 PyUnicode_KIND(self); 13861 } 13862 /* If the wstr pointer is present, account for it unless it is shared 13863 with the data pointer. Check if the data is not shared. */ 13864 if (_PyUnicode_HAS_WSTR_MEMORY(self)) 13865 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t); 13866 if (_PyUnicode_HAS_UTF8_MEMORY(self)) 13867 size += PyUnicode_UTF8_LENGTH(self) + 1; 13868 13869 return PyLong_FromSsize_t(size); 13870 } 13871 13872 static PyObject * 13873 unicode_getnewargs(PyObject *v) 13874 { 13875 PyObject *copy = _PyUnicode_Copy(v); 13876 if (!copy) 13877 return NULL; 13878 return Py_BuildValue("(N)", copy); 13879 } 13880 13881 static PyMethodDef unicode_methods[] = { 13882 UNICODE_ENCODE_METHODDEF 13883 UNICODE_REPLACE_METHODDEF 13884 UNICODE_SPLIT_METHODDEF 13885 UNICODE_RSPLIT_METHODDEF 13886 UNICODE_JOIN_METHODDEF 13887 UNICODE_CAPITALIZE_METHODDEF 13888 UNICODE_CASEFOLD_METHODDEF 13889 UNICODE_TITLE_METHODDEF 13890 UNICODE_CENTER_METHODDEF 13891 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 13892 UNICODE_EXPANDTABS_METHODDEF 13893 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 13894 UNICODE_PARTITION_METHODDEF 13895 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 13896 UNICODE_LJUST_METHODDEF 13897 UNICODE_LOWER_METHODDEF 13898 UNICODE_LSTRIP_METHODDEF 13899 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 13900 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, 
rindex__doc__}, 13901 UNICODE_RJUST_METHODDEF 13902 UNICODE_RSTRIP_METHODDEF 13903 UNICODE_RPARTITION_METHODDEF 13904 UNICODE_SPLITLINES_METHODDEF 13905 UNICODE_STRIP_METHODDEF 13906 UNICODE_SWAPCASE_METHODDEF 13907 UNICODE_TRANSLATE_METHODDEF 13908 UNICODE_UPPER_METHODDEF 13909 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 13910 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 13911 UNICODE_ISASCII_METHODDEF 13912 UNICODE_ISLOWER_METHODDEF 13913 UNICODE_ISUPPER_METHODDEF 13914 UNICODE_ISTITLE_METHODDEF 13915 UNICODE_ISSPACE_METHODDEF 13916 UNICODE_ISDECIMAL_METHODDEF 13917 UNICODE_ISDIGIT_METHODDEF 13918 UNICODE_ISNUMERIC_METHODDEF 13919 UNICODE_ISALPHA_METHODDEF 13920 UNICODE_ISALNUM_METHODDEF 13921 UNICODE_ISIDENTIFIER_METHODDEF 13922 UNICODE_ISPRINTABLE_METHODDEF 13923 UNICODE_ZFILL_METHODDEF 13924 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 13925 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 13926 UNICODE___FORMAT___METHODDEF 13927 UNICODE_MAKETRANS_METHODDEF 13928 UNICODE_SIZEOF_METHODDEF 13929 #if 0 13930 /* These methods are just used for debugging the implementation. 
*/ 13931 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 13932 #endif 13933 13934 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 13935 {NULL, NULL} 13936 }; 13937 13938 static PyObject * 13939 unicode_mod(PyObject *v, PyObject *w) 13940 { 13941 if (!PyUnicode_Check(v)) 13942 Py_RETURN_NOTIMPLEMENTED; 13943 return PyUnicode_Format(v, w); 13944 } 13945 13946 static PyNumberMethods unicode_as_number = { 13947 0, /*nb_add*/ 13948 0, /*nb_subtract*/ 13949 0, /*nb_multiply*/ 13950 unicode_mod, /*nb_remainder*/ 13951 }; 13952 13953 static PySequenceMethods unicode_as_sequence = { 13954 (lenfunc) unicode_length, /* sq_length */ 13955 PyUnicode_Concat, /* sq_concat */ 13956 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 13957 (ssizeargfunc) unicode_getitem, /* sq_item */ 13958 0, /* sq_slice */ 13959 0, /* sq_ass_item */ 13960 0, /* sq_ass_slice */ 13961 PyUnicode_Contains, /* sq_contains */ 13962 }; 13963 13964 static PyObject* 13965 unicode_subscript(PyObject* self, PyObject* item) 13966 { 13967 if (PyUnicode_READY(self) == -1) 13968 return NULL; 13969 13970 if (PyIndex_Check(item)) { 13971 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 13972 if (i == -1 && PyErr_Occurred()) 13973 return NULL; 13974 if (i < 0) 13975 i += PyUnicode_GET_LENGTH(self); 13976 return unicode_getitem(self, i); 13977 } else if (PySlice_Check(item)) { 13978 Py_ssize_t start, stop, step, slicelength, cur, i; 13979 PyObject *result; 13980 void *src_data, *dest_data; 13981 int src_kind, dest_kind; 13982 Py_UCS4 ch, max_char, kind_limit; 13983 13984 if (PySlice_Unpack(item, &start, &stop, &step) < 0) { 13985 return NULL; 13986 } 13987 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self), 13988 &start, &stop, step); 13989 13990 if (slicelength <= 0) { 13991 _Py_RETURN_UNICODE_EMPTY(); 13992 } else if (start == 0 && step == 1 && 13993 slicelength == PyUnicode_GET_LENGTH(self)) { 13994 return unicode_result_unchanged(self); 13995 } else if 
(step == 1) { 13996 return PyUnicode_Substring(self, 13997 start, start + slicelength); 13998 } 13999 /* General case */ 14000 src_kind = PyUnicode_KIND(self); 14001 src_data = PyUnicode_DATA(self); 14002 if (!PyUnicode_IS_ASCII(self)) { 14003 kind_limit = kind_maxchar_limit(src_kind); 14004 max_char = 0; 14005 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 14006 ch = PyUnicode_READ(src_kind, src_data, cur); 14007 if (ch > max_char) { 14008 max_char = ch; 14009 if (max_char >= kind_limit) 14010 break; 14011 } 14012 } 14013 } 14014 else 14015 max_char = 127; 14016 result = PyUnicode_New(slicelength, max_char); 14017 if (result == NULL) 14018 return NULL; 14019 dest_kind = PyUnicode_KIND(result); 14020 dest_data = PyUnicode_DATA(result); 14021 14022 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 14023 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 14024 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 14025 } 14026 assert(_PyUnicode_CheckConsistency(result, 1)); 14027 return result; 14028 } else { 14029 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 14030 return NULL; 14031 } 14032 } 14033 14034 static PyMappingMethods unicode_as_mapping = { 14035 (lenfunc)unicode_length, /* mp_length */ 14036 (binaryfunc)unicode_subscript, /* mp_subscript */ 14037 (objobjargproc)0, /* mp_ass_subscript */ 14038 }; 14039 14040 14041 /* Helpers for PyUnicode_Format() */ 14042 14043 struct unicode_formatter_t { 14044 PyObject *args; 14045 int args_owned; 14046 Py_ssize_t arglen, argidx; 14047 PyObject *dict; 14048 14049 enum PyUnicode_Kind fmtkind; 14050 Py_ssize_t fmtcnt, fmtpos; 14051 void *fmtdata; 14052 PyObject *fmtstr; 14053 14054 _PyUnicodeWriter writer; 14055 }; 14056 14057 struct unicode_format_arg_t { 14058 Py_UCS4 ch; 14059 int flags; 14060 Py_ssize_t width; 14061 int prec; 14062 int sign; 14063 }; 14064 14065 static PyObject * 14066 unicode_format_getnextarg(struct unicode_formatter_t *ctx) 14067 { 14068 Py_ssize_t 
argidx = ctx->argidx; 14069 14070 if (argidx < ctx->arglen) { 14071 ctx->argidx++; 14072 if (ctx->arglen < 0) 14073 return ctx->args; 14074 else 14075 return PyTuple_GetItem(ctx->args, argidx); 14076 } 14077 PyErr_SetString(PyExc_TypeError, 14078 "not enough arguments for format string"); 14079 return NULL; 14080 } 14081 14082 /* Returns a new reference to a PyUnicode object, or NULL on failure. */ 14083 14084 /* Format a float into the writer if the writer is not NULL, or into *p_output 14085 otherwise. 14086 14087 Return 0 on success, raise an exception and return -1 on error. */ 14088 static int 14089 formatfloat(PyObject *v, struct unicode_format_arg_t *arg, 14090 PyObject **p_output, 14091 _PyUnicodeWriter *writer) 14092 { 14093 char *p; 14094 double x; 14095 Py_ssize_t len; 14096 int prec; 14097 int dtoa_flags; 14098 14099 x = PyFloat_AsDouble(v); 14100 if (x == -1.0 && PyErr_Occurred()) 14101 return -1; 14102 14103 prec = arg->prec; 14104 if (prec < 0) 14105 prec = 6; 14106 14107 if (arg->flags & F_ALT) 14108 dtoa_flags = Py_DTSF_ALT; 14109 else 14110 dtoa_flags = 0; 14111 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL); 14112 if (p == NULL) 14113 return -1; 14114 len = strlen(p); 14115 if (writer) { 14116 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) { 14117 PyMem_Free(p); 14118 return -1; 14119 } 14120 } 14121 else 14122 *p_output = _PyUnicode_FromASCII(p, len); 14123 PyMem_Free(p); 14124 return 0; 14125 } 14126 14127 /* formatlong() emulates the format codes d, u, o, x and X, and 14128 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for 14129 * Python's regular ints. 14130 * Return value: a new PyUnicodeObject*, or NULL if error. 14131 * The output string is of the form 14132 * "-"? ("0x" | "0X")? digit+ 14133 * "0x"/"0X" are present only for x and X conversions, with F_ALT 14134 * set in flags. 
The case of hex digits will be correct, 14135 * There will be at least prec digits, zero-filled on the left if 14136 * necessary to get that many. 14137 * val object to be converted 14138 * flags bitmask of format flags; only F_ALT is looked at 14139 * prec minimum number of digits; 0-fill on left if needed 14140 * type a character in [duoxX]; u acts the same as d 14141 * 14142 * CAUTION: o, x and X conversions on regular ints can never 14143 * produce a '-' sign, but can for Python's unbounded ints. 14144 */ 14145 PyObject * 14146 _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type) 14147 { 14148 PyObject *result = NULL; 14149 char *buf; 14150 Py_ssize_t i; 14151 int sign; /* 1 if '-', else 0 */ 14152 int len; /* number of characters */ 14153 Py_ssize_t llen; 14154 int numdigits; /* len == numnondigits + numdigits */ 14155 int numnondigits = 0; 14156 14157 /* Avoid exceeding SSIZE_T_MAX */ 14158 if (prec > INT_MAX-3) { 14159 PyErr_SetString(PyExc_OverflowError, 14160 "precision too large"); 14161 return NULL; 14162 } 14163 14164 assert(PyLong_Check(val)); 14165 14166 switch (type) { 14167 default: 14168 Py_UNREACHABLE(); 14169 case 'd': 14170 case 'i': 14171 case 'u': 14172 /* int and int subclasses should print numerically when a numeric */ 14173 /* format code is used (see issue18780) */ 14174 result = PyNumber_ToBase(val, 10); 14175 break; 14176 case 'o': 14177 numnondigits = 2; 14178 result = PyNumber_ToBase(val, 8); 14179 break; 14180 case 'x': 14181 case 'X': 14182 numnondigits = 2; 14183 result = PyNumber_ToBase(val, 16); 14184 break; 14185 } 14186 if (!result) 14187 return NULL; 14188 14189 assert(unicode_modifiable(result)); 14190 assert(PyUnicode_IS_READY(result)); 14191 assert(PyUnicode_IS_ASCII(result)); 14192 14193 /* To modify the string in-place, there can only be one reference. 
*/ 14194 if (Py_REFCNT(result) != 1) { 14195 Py_DECREF(result); 14196 PyErr_BadInternalCall(); 14197 return NULL; 14198 } 14199 buf = PyUnicode_DATA(result); 14200 llen = PyUnicode_GET_LENGTH(result); 14201 if (llen > INT_MAX) { 14202 Py_DECREF(result); 14203 PyErr_SetString(PyExc_ValueError, 14204 "string too large in _PyUnicode_FormatLong"); 14205 return NULL; 14206 } 14207 len = (int)llen; 14208 sign = buf[0] == '-'; 14209 numnondigits += sign; 14210 numdigits = len - numnondigits; 14211 assert(numdigits > 0); 14212 14213 /* Get rid of base marker unless F_ALT */ 14214 if (((alt) == 0 && 14215 (type == 'o' || type == 'x' || type == 'X'))) { 14216 assert(buf[sign] == '0'); 14217 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || 14218 buf[sign+1] == 'o'); 14219 numnondigits -= 2; 14220 buf += 2; 14221 len -= 2; 14222 if (sign) 14223 buf[0] = '-'; 14224 assert(len == numnondigits + numdigits); 14225 assert(numdigits > 0); 14226 } 14227 14228 /* Fill with leading zeroes to meet minimum width. */ 14229 if (prec > numdigits) { 14230 PyObject *r1 = PyBytes_FromStringAndSize(NULL, 14231 numnondigits + prec); 14232 char *b1; 14233 if (!r1) { 14234 Py_DECREF(result); 14235 return NULL; 14236 } 14237 b1 = PyBytes_AS_STRING(r1); 14238 for (i = 0; i < numnondigits; ++i) 14239 *b1++ = *buf++; 14240 for (i = 0; i < prec - numdigits; i++) 14241 *b1++ = '0'; 14242 for (i = 0; i < numdigits; i++) 14243 *b1++ = *buf++; 14244 *b1 = '\0'; 14245 Py_DECREF(result); 14246 result = r1; 14247 buf = PyBytes_AS_STRING(result); 14248 len = numnondigits + prec; 14249 } 14250 14251 /* Fix up case for hex conversions. */ 14252 if (type == 'X') { 14253 /* Need to convert all lower case letters to upper case. 14254 and need to convert 0x to 0X (and -0x to -0X). 
*/ 14255 for (i = 0; i < len; i++) 14256 if (buf[i] >= 'a' && buf[i] <= 'x') 14257 buf[i] -= 'a'-'A'; 14258 } 14259 if (!PyUnicode_Check(result) 14260 || buf != PyUnicode_DATA(result)) { 14261 PyObject *unicode; 14262 unicode = _PyUnicode_FromASCII(buf, len); 14263 Py_DECREF(result); 14264 result = unicode; 14265 } 14266 else if (len != PyUnicode_GET_LENGTH(result)) { 14267 if (PyUnicode_Resize(&result, len) < 0) 14268 Py_CLEAR(result); 14269 } 14270 return result; 14271 } 14272 14273 /* Format an integer or a float as an integer. 14274 * Return 1 if the number has been formatted into the writer, 14275 * 0 if the number has been formatted into *p_output 14276 * -1 and raise an exception on error */ 14277 static int 14278 mainformatlong(PyObject *v, 14279 struct unicode_format_arg_t *arg, 14280 PyObject **p_output, 14281 _PyUnicodeWriter *writer) 14282 { 14283 PyObject *iobj, *res; 14284 char type = (char)arg->ch; 14285 14286 if (!PyNumber_Check(v)) 14287 goto wrongtype; 14288 14289 /* make sure number is a type of integer for o, x, and X */ 14290 if (!PyLong_Check(v)) { 14291 if (type == 'o' || type == 'x' || type == 'X') { 14292 iobj = PyNumber_Index(v); 14293 if (iobj == NULL) { 14294 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14295 goto wrongtype; 14296 return -1; 14297 } 14298 } 14299 else { 14300 iobj = PyNumber_Long(v); 14301 if (iobj == NULL ) { 14302 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14303 goto wrongtype; 14304 return -1; 14305 } 14306 } 14307 assert(PyLong_Check(iobj)); 14308 } 14309 else { 14310 iobj = v; 14311 Py_INCREF(iobj); 14312 } 14313 14314 if (PyLong_CheckExact(v) 14315 && arg->width == -1 && arg->prec == -1 14316 && !(arg->flags & (F_SIGN | F_BLANK)) 14317 && type != 'X') 14318 { 14319 /* Fast path */ 14320 int alternate = arg->flags & F_ALT; 14321 int base; 14322 14323 switch(type) 14324 { 14325 default: 14326 Py_UNREACHABLE(); 14327 case 'd': 14328 case 'i': 14329 case 'u': 14330 base = 10; 14331 break; 14332 case 'o': 14333 base = 
8; 14334 break; 14335 case 'x': 14336 case 'X': 14337 base = 16; 14338 break; 14339 } 14340 14341 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { 14342 Py_DECREF(iobj); 14343 return -1; 14344 } 14345 Py_DECREF(iobj); 14346 return 1; 14347 } 14348 14349 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type); 14350 Py_DECREF(iobj); 14351 if (res == NULL) 14352 return -1; 14353 *p_output = res; 14354 return 0; 14355 14356 wrongtype: 14357 switch(type) 14358 { 14359 case 'o': 14360 case 'x': 14361 case 'X': 14362 PyErr_Format(PyExc_TypeError, 14363 "%%%c format: an integer is required, " 14364 "not %.200s", 14365 type, Py_TYPE(v)->tp_name); 14366 break; 14367 default: 14368 PyErr_Format(PyExc_TypeError, 14369 "%%%c format: a number is required, " 14370 "not %.200s", 14371 type, Py_TYPE(v)->tp_name); 14372 break; 14373 } 14374 return -1; 14375 } 14376 14377 static Py_UCS4 14378 formatchar(PyObject *v) 14379 { 14380 /* presume that the buffer is at least 3 characters long */ 14381 if (PyUnicode_Check(v)) { 14382 if (PyUnicode_GET_LENGTH(v) == 1) { 14383 return PyUnicode_READ_CHAR(v, 0); 14384 } 14385 goto onError; 14386 } 14387 else { 14388 PyObject *iobj; 14389 long x; 14390 /* make sure number is a type of integer */ 14391 if (!PyLong_Check(v)) { 14392 iobj = PyNumber_Index(v); 14393 if (iobj == NULL) { 14394 goto onError; 14395 } 14396 x = PyLong_AsLong(iobj); 14397 Py_DECREF(iobj); 14398 } 14399 else { 14400 x = PyLong_AsLong(v); 14401 } 14402 if (x == -1 && PyErr_Occurred()) 14403 goto onError; 14404 14405 if (x < 0 || x > MAX_UNICODE) { 14406 PyErr_SetString(PyExc_OverflowError, 14407 "%c arg not in range(0x110000)"); 14408 return (Py_UCS4) -1; 14409 } 14410 14411 return (Py_UCS4) x; 14412 } 14413 14414 onError: 14415 PyErr_SetString(PyExc_TypeError, 14416 "%c requires int or char"); 14417 return (Py_UCS4) -1; 14418 } 14419 14420 /* Parse options of an argument: flags, width, precision. 14421 Handle also "%(name)" syntax. 
14422 14423 Return 0 if the argument has been formatted into arg->str. 14424 Return 1 if the argument has been written into ctx->writer, 14425 Raise an exception and return -1 on error. */ 14426 static int 14427 unicode_format_arg_parse(struct unicode_formatter_t *ctx, 14428 struct unicode_format_arg_t *arg) 14429 { 14430 #define FORMAT_READ(ctx) \ 14431 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos) 14432 14433 PyObject *v; 14434 14435 if (arg->ch == '(') { 14436 /* Get argument value from a dictionary. Example: "%(name)s". */ 14437 Py_ssize_t keystart; 14438 Py_ssize_t keylen; 14439 PyObject *key; 14440 int pcount = 1; 14441 14442 if (ctx->dict == NULL) { 14443 PyErr_SetString(PyExc_TypeError, 14444 "format requires a mapping"); 14445 return -1; 14446 } 14447 ++ctx->fmtpos; 14448 --ctx->fmtcnt; 14449 keystart = ctx->fmtpos; 14450 /* Skip over balanced parentheses */ 14451 while (pcount > 0 && --ctx->fmtcnt >= 0) { 14452 arg->ch = FORMAT_READ(ctx); 14453 if (arg->ch == ')') 14454 --pcount; 14455 else if (arg->ch == '(') 14456 ++pcount; 14457 ctx->fmtpos++; 14458 } 14459 keylen = ctx->fmtpos - keystart - 1; 14460 if (ctx->fmtcnt < 0 || pcount > 0) { 14461 PyErr_SetString(PyExc_ValueError, 14462 "incomplete format key"); 14463 return -1; 14464 } 14465 key = PyUnicode_Substring(ctx->fmtstr, 14466 keystart, keystart + keylen); 14467 if (key == NULL) 14468 return -1; 14469 if (ctx->args_owned) { 14470 ctx->args_owned = 0; 14471 Py_DECREF(ctx->args); 14472 } 14473 ctx->args = PyObject_GetItem(ctx->dict, key); 14474 Py_DECREF(key); 14475 if (ctx->args == NULL) 14476 return -1; 14477 ctx->args_owned = 1; 14478 ctx->arglen = -1; 14479 ctx->argidx = -2; 14480 } 14481 14482 /* Parse flags. Example: "%+i" => flags=F_SIGN. 
*/ 14483 while (--ctx->fmtcnt >= 0) { 14484 arg->ch = FORMAT_READ(ctx); 14485 ctx->fmtpos++; 14486 switch (arg->ch) { 14487 case '-': arg->flags |= F_LJUST; continue; 14488 case '+': arg->flags |= F_SIGN; continue; 14489 case ' ': arg->flags |= F_BLANK; continue; 14490 case '#': arg->flags |= F_ALT; continue; 14491 case '0': arg->flags |= F_ZERO; continue; 14492 } 14493 break; 14494 } 14495 14496 /* Parse width. Example: "%10s" => width=10 */ 14497 if (arg->ch == '*') { 14498 v = unicode_format_getnextarg(ctx); 14499 if (v == NULL) 14500 return -1; 14501 if (!PyLong_Check(v)) { 14502 PyErr_SetString(PyExc_TypeError, 14503 "* wants int"); 14504 return -1; 14505 } 14506 arg->width = PyLong_AsSsize_t(v); 14507 if (arg->width == -1 && PyErr_Occurred()) 14508 return -1; 14509 if (arg->width < 0) { 14510 arg->flags |= F_LJUST; 14511 arg->width = -arg->width; 14512 } 14513 if (--ctx->fmtcnt >= 0) { 14514 arg->ch = FORMAT_READ(ctx); 14515 ctx->fmtpos++; 14516 } 14517 } 14518 else if (arg->ch >= '0' && arg->ch <= '9') { 14519 arg->width = arg->ch - '0'; 14520 while (--ctx->fmtcnt >= 0) { 14521 arg->ch = FORMAT_READ(ctx); 14522 ctx->fmtpos++; 14523 if (arg->ch < '0' || arg->ch > '9') 14524 break; 14525 /* Since arg->ch is unsigned, the RHS would end up as unsigned, 14526 mixing signed and unsigned comparison. Since arg->ch is between 14527 '0' and '9', casting to int is safe. */ 14528 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) { 14529 PyErr_SetString(PyExc_ValueError, 14530 "width too big"); 14531 return -1; 14532 } 14533 arg->width = arg->width*10 + (arg->ch - '0'); 14534 } 14535 } 14536 14537 /* Parse precision. 
Example: "%.3f" => prec=3 */ 14538 if (arg->ch == '.') { 14539 arg->prec = 0; 14540 if (--ctx->fmtcnt >= 0) { 14541 arg->ch = FORMAT_READ(ctx); 14542 ctx->fmtpos++; 14543 } 14544 if (arg->ch == '*') { 14545 v = unicode_format_getnextarg(ctx); 14546 if (v == NULL) 14547 return -1; 14548 if (!PyLong_Check(v)) { 14549 PyErr_SetString(PyExc_TypeError, 14550 "* wants int"); 14551 return -1; 14552 } 14553 arg->prec = _PyLong_AsInt(v); 14554 if (arg->prec == -1 && PyErr_Occurred()) 14555 return -1; 14556 if (arg->prec < 0) 14557 arg->prec = 0; 14558 if (--ctx->fmtcnt >= 0) { 14559 arg->ch = FORMAT_READ(ctx); 14560 ctx->fmtpos++; 14561 } 14562 } 14563 else if (arg->ch >= '0' && arg->ch <= '9') { 14564 arg->prec = arg->ch - '0'; 14565 while (--ctx->fmtcnt >= 0) { 14566 arg->ch = FORMAT_READ(ctx); 14567 ctx->fmtpos++; 14568 if (arg->ch < '0' || arg->ch > '9') 14569 break; 14570 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) { 14571 PyErr_SetString(PyExc_ValueError, 14572 "precision too big"); 14573 return -1; 14574 } 14575 arg->prec = arg->prec*10 + (arg->ch - '0'); 14576 } 14577 } 14578 } 14579 14580 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */ 14581 if (ctx->fmtcnt >= 0) { 14582 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') { 14583 if (--ctx->fmtcnt >= 0) { 14584 arg->ch = FORMAT_READ(ctx); 14585 ctx->fmtpos++; 14586 } 14587 } 14588 } 14589 if (ctx->fmtcnt < 0) { 14590 PyErr_SetString(PyExc_ValueError, 14591 "incomplete format"); 14592 return -1; 14593 } 14594 return 0; 14595 14596 #undef FORMAT_READ 14597 } 14598 14599 /* Format one argument. Supported conversion specifiers: 14600 14601 - "s", "r", "a": any type 14602 - "i", "d", "u": int or float 14603 - "o", "x", "X": int 14604 - "e", "E", "f", "F", "g", "G": float 14605 - "c": int or str (1 character) 14606 14607 When possible, the output is written directly into the Unicode writer 14608 (ctx->writer). A string is created when padding is required. 
14609 14610 Return 0 if the argument has been formatted into *p_str, 14611 1 if the argument has been written into ctx->writer, 14612 -1 on error. */ 14613 static int 14614 unicode_format_arg_format(struct unicode_formatter_t *ctx, 14615 struct unicode_format_arg_t *arg, 14616 PyObject **p_str) 14617 { 14618 PyObject *v; 14619 _PyUnicodeWriter *writer = &ctx->writer; 14620 14621 if (ctx->fmtcnt == 0) 14622 ctx->writer.overallocate = 0; 14623 14624 v = unicode_format_getnextarg(ctx); 14625 if (v == NULL) 14626 return -1; 14627 14628 14629 switch (arg->ch) { 14630 case 's': 14631 case 'r': 14632 case 'a': 14633 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) { 14634 /* Fast path */ 14635 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1) 14636 return -1; 14637 return 1; 14638 } 14639 14640 if (PyUnicode_CheckExact(v) && arg->ch == 's') { 14641 *p_str = v; 14642 Py_INCREF(*p_str); 14643 } 14644 else { 14645 if (arg->ch == 's') 14646 *p_str = PyObject_Str(v); 14647 else if (arg->ch == 'r') 14648 *p_str = PyObject_Repr(v); 14649 else 14650 *p_str = PyObject_ASCII(v); 14651 } 14652 break; 14653 14654 case 'i': 14655 case 'd': 14656 case 'u': 14657 case 'o': 14658 case 'x': 14659 case 'X': 14660 { 14661 int ret = mainformatlong(v, arg, p_str, writer); 14662 if (ret != 0) 14663 return ret; 14664 arg->sign = 1; 14665 break; 14666 } 14667 14668 case 'e': 14669 case 'E': 14670 case 'f': 14671 case 'F': 14672 case 'g': 14673 case 'G': 14674 if (arg->width == -1 && arg->prec == -1 14675 && !(arg->flags & (F_SIGN | F_BLANK))) 14676 { 14677 /* Fast path */ 14678 if (formatfloat(v, arg, NULL, writer) == -1) 14679 return -1; 14680 return 1; 14681 } 14682 14683 arg->sign = 1; 14684 if (formatfloat(v, arg, p_str, NULL) == -1) 14685 return -1; 14686 break; 14687 14688 case 'c': 14689 { 14690 Py_UCS4 ch = formatchar(v); 14691 if (ch == (Py_UCS4) -1) 14692 return -1; 14693 if (arg->width == -1 && arg->prec == -1) { 14694 /* Fast path */ 14695 if 
(_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) 14696 return -1; 14697 return 1; 14698 } 14699 *p_str = PyUnicode_FromOrdinal(ch); 14700 break; 14701 } 14702 14703 default: 14704 PyErr_Format(PyExc_ValueError, 14705 "unsupported format character '%c' (0x%x) " 14706 "at index %zd", 14707 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?', 14708 (int)arg->ch, 14709 ctx->fmtpos - 1); 14710 return -1; 14711 } 14712 if (*p_str == NULL) 14713 return -1; 14714 assert (PyUnicode_Check(*p_str)); 14715 return 0; 14716 } 14717 14718 static int 14719 unicode_format_arg_output(struct unicode_formatter_t *ctx, 14720 struct unicode_format_arg_t *arg, 14721 PyObject *str) 14722 { 14723 Py_ssize_t len; 14724 enum PyUnicode_Kind kind; 14725 void *pbuf; 14726 Py_ssize_t pindex; 14727 Py_UCS4 signchar; 14728 Py_ssize_t buflen; 14729 Py_UCS4 maxchar; 14730 Py_ssize_t sublen; 14731 _PyUnicodeWriter *writer = &ctx->writer; 14732 Py_UCS4 fill; 14733 14734 fill = ' '; 14735 if (arg->sign && arg->flags & F_ZERO) 14736 fill = '0'; 14737 14738 if (PyUnicode_READY(str) == -1) 14739 return -1; 14740 14741 len = PyUnicode_GET_LENGTH(str); 14742 if ((arg->width == -1 || arg->width <= len) 14743 && (arg->prec == -1 || arg->prec >= len) 14744 && !(arg->flags & (F_SIGN | F_BLANK))) 14745 { 14746 /* Fast path */ 14747 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) 14748 return -1; 14749 return 0; 14750 } 14751 14752 /* Truncate the string for "s", "r" and "a" formats 14753 if the precision is set */ 14754 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') { 14755 if (arg->prec >= 0 && len > arg->prec) 14756 len = arg->prec; 14757 } 14758 14759 /* Adjust sign and width */ 14760 kind = PyUnicode_KIND(str); 14761 pbuf = PyUnicode_DATA(str); 14762 pindex = 0; 14763 signchar = '\0'; 14764 if (arg->sign) { 14765 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); 14766 if (ch == '-' || ch == '+') { 14767 signchar = ch; 14768 len--; 14769 pindex++; 14770 } 14771 else if (arg->flags & F_SIGN) 14772 
signchar = '+'; 14773 else if (arg->flags & F_BLANK) 14774 signchar = ' '; 14775 else 14776 arg->sign = 0; 14777 } 14778 if (arg->width < len) 14779 arg->width = len; 14780 14781 /* Prepare the writer */ 14782 maxchar = writer->maxchar; 14783 if (!(arg->flags & F_LJUST)) { 14784 if (arg->sign) { 14785 if ((arg->width-1) > len) 14786 maxchar = Py_MAX(maxchar, fill); 14787 } 14788 else { 14789 if (arg->width > len) 14790 maxchar = Py_MAX(maxchar, fill); 14791 } 14792 } 14793 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) { 14794 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len); 14795 maxchar = Py_MAX(maxchar, strmaxchar); 14796 } 14797 14798 buflen = arg->width; 14799 if (arg->sign && len == arg->width) 14800 buflen++; 14801 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1) 14802 return -1; 14803 14804 /* Write the sign if needed */ 14805 if (arg->sign) { 14806 if (fill != ' ') { 14807 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 14808 writer->pos += 1; 14809 } 14810 if (arg->width > len) 14811 arg->width--; 14812 } 14813 14814 /* Write the numeric prefix for "x", "X" and "o" formats 14815 if the alternate form is used. 14816 For example, write "0x" for the "%#x" format. 
*/ 14817 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 14818 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 14819 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch); 14820 if (fill != ' ') { 14821 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 14822 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 14823 writer->pos += 2; 14824 pindex += 2; 14825 } 14826 arg->width -= 2; 14827 if (arg->width < 0) 14828 arg->width = 0; 14829 len -= 2; 14830 } 14831 14832 /* Pad left with the fill character if needed */ 14833 if (arg->width > len && !(arg->flags & F_LJUST)) { 14834 sublen = arg->width - len; 14835 FILL(writer->kind, writer->data, fill, writer->pos, sublen); 14836 writer->pos += sublen; 14837 arg->width = len; 14838 } 14839 14840 /* If padding with spaces: write sign if needed and/or numeric prefix if 14841 the alternate form is used */ 14842 if (fill == ' ') { 14843 if (arg->sign) { 14844 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 14845 writer->pos += 1; 14846 } 14847 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 14848 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 14849 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch); 14850 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 14851 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 14852 writer->pos += 2; 14853 pindex += 2; 14854 } 14855 } 14856 14857 /* Write characters */ 14858 if (len) { 14859 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 14860 str, pindex, len); 14861 writer->pos += len; 14862 } 14863 14864 /* Pad right with the fill character if needed */ 14865 if (arg->width > len) { 14866 sublen = arg->width - len; 14867 FILL(writer->kind, writer->data, ' ', writer->pos, sublen); 14868 writer->pos += sublen; 14869 } 14870 return 0; 14871 } 14872 14873 /* Helper of PyUnicode_Format(): format one arg. 
14874 Return 0 on success, raise an exception and return -1 on error. */ 14875 static int 14876 unicode_format_arg(struct unicode_formatter_t *ctx) 14877 { 14878 struct unicode_format_arg_t arg; 14879 PyObject *str; 14880 int ret; 14881 14882 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos); 14883 if (arg.ch == '%') { 14884 ctx->fmtpos++; 14885 ctx->fmtcnt--; 14886 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0) 14887 return -1; 14888 return 0; 14889 } 14890 arg.flags = 0; 14891 arg.width = -1; 14892 arg.prec = -1; 14893 arg.sign = 0; 14894 str = NULL; 14895 14896 ret = unicode_format_arg_parse(ctx, &arg); 14897 if (ret == -1) 14898 return -1; 14899 14900 ret = unicode_format_arg_format(ctx, &arg, &str); 14901 if (ret == -1) 14902 return -1; 14903 14904 if (ret != 1) { 14905 ret = unicode_format_arg_output(ctx, &arg, str); 14906 Py_DECREF(str); 14907 if (ret == -1) 14908 return -1; 14909 } 14910 14911 if (ctx->dict && (ctx->argidx < ctx->arglen)) { 14912 PyErr_SetString(PyExc_TypeError, 14913 "not all arguments converted during string formatting"); 14914 return -1; 14915 } 14916 return 0; 14917 } 14918 14919 PyObject * 14920 PyUnicode_Format(PyObject *format, PyObject *args) 14921 { 14922 struct unicode_formatter_t ctx; 14923 14924 if (format == NULL || args == NULL) { 14925 PyErr_BadInternalCall(); 14926 return NULL; 14927 } 14928 14929 if (ensure_unicode(format) < 0) 14930 return NULL; 14931 14932 ctx.fmtstr = format; 14933 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr); 14934 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr); 14935 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); 14936 ctx.fmtpos = 0; 14937 14938 _PyUnicodeWriter_Init(&ctx.writer); 14939 ctx.writer.min_length = ctx.fmtcnt + 100; 14940 ctx.writer.overallocate = 1; 14941 14942 if (PyTuple_Check(args)) { 14943 ctx.arglen = PyTuple_Size(args); 14944 ctx.argidx = 0; 14945 } 14946 else { 14947 ctx.arglen = -1; 14948 ctx.argidx = -2; 14949 } 14950 ctx.args_owned = 0; 14951 if 
(PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args)) 14952 ctx.dict = args; 14953 else 14954 ctx.dict = NULL; 14955 ctx.args = args; 14956 14957 while (--ctx.fmtcnt >= 0) { 14958 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 14959 Py_ssize_t nonfmtpos; 14960 14961 nonfmtpos = ctx.fmtpos++; 14962 while (ctx.fmtcnt >= 0 && 14963 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 14964 ctx.fmtpos++; 14965 ctx.fmtcnt--; 14966 } 14967 if (ctx.fmtcnt < 0) { 14968 ctx.fmtpos--; 14969 ctx.writer.overallocate = 0; 14970 } 14971 14972 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr, 14973 nonfmtpos, ctx.fmtpos) < 0) 14974 goto onError; 14975 } 14976 else { 14977 ctx.fmtpos++; 14978 if (unicode_format_arg(&ctx) == -1) 14979 goto onError; 14980 } 14981 } 14982 14983 if (ctx.argidx < ctx.arglen && !ctx.dict) { 14984 PyErr_SetString(PyExc_TypeError, 14985 "not all arguments converted during string formatting"); 14986 goto onError; 14987 } 14988 14989 if (ctx.args_owned) { 14990 Py_DECREF(ctx.args); 14991 } 14992 return _PyUnicodeWriter_Finish(&ctx.writer); 14993 14994 onError: 14995 _PyUnicodeWriter_Dealloc(&ctx.writer); 14996 if (ctx.args_owned) { 14997 Py_DECREF(ctx.args); 14998 } 14999 return NULL; 15000 } 15001 15002 static PyObject * 15003 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 15004 15005 static PyObject * 15006 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 15007 { 15008 PyObject *x = NULL; 15009 static char *kwlist[] = {"object", "encoding", "errors", 0}; 15010 char *encoding = NULL; 15011 char *errors = NULL; 15012 15013 if (type != &PyUnicode_Type) 15014 return unicode_subtype_new(type, args, kwds); 15015 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 15016 kwlist, &x, &encoding, &errors)) 15017 return NULL; 15018 if (x == NULL) 15019 _Py_RETURN_UNICODE_EMPTY(); 15020 if (encoding == NULL && errors == NULL) 15021 return PyObject_Str(x); 15022 
else 15023 return PyUnicode_FromEncodedObject(x, encoding, errors); 15024 } 15025 15026 static PyObject * 15027 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 15028 { 15029 PyObject *unicode, *self; 15030 Py_ssize_t length, char_size; 15031 int share_wstr, share_utf8; 15032 unsigned int kind; 15033 void *data; 15034 15035 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 15036 15037 unicode = unicode_new(&PyUnicode_Type, args, kwds); 15038 if (unicode == NULL) 15039 return NULL; 15040 assert(_PyUnicode_CHECK(unicode)); 15041 if (PyUnicode_READY(unicode) == -1) { 15042 Py_DECREF(unicode); 15043 return NULL; 15044 } 15045 15046 self = type->tp_alloc(type, 0); 15047 if (self == NULL) { 15048 Py_DECREF(unicode); 15049 return NULL; 15050 } 15051 kind = PyUnicode_KIND(unicode); 15052 length = PyUnicode_GET_LENGTH(unicode); 15053 15054 _PyUnicode_LENGTH(self) = length; 15055 #ifdef Py_DEBUG 15056 _PyUnicode_HASH(self) = -1; 15057 #else 15058 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 15059 #endif 15060 _PyUnicode_STATE(self).interned = 0; 15061 _PyUnicode_STATE(self).kind = kind; 15062 _PyUnicode_STATE(self).compact = 0; 15063 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 15064 _PyUnicode_STATE(self).ready = 1; 15065 _PyUnicode_WSTR(self) = NULL; 15066 _PyUnicode_UTF8_LENGTH(self) = 0; 15067 _PyUnicode_UTF8(self) = NULL; 15068 _PyUnicode_WSTR_LENGTH(self) = 0; 15069 _PyUnicode_DATA_ANY(self) = NULL; 15070 15071 share_utf8 = 0; 15072 share_wstr = 0; 15073 if (kind == PyUnicode_1BYTE_KIND) { 15074 char_size = 1; 15075 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 15076 share_utf8 = 1; 15077 } 15078 else if (kind == PyUnicode_2BYTE_KIND) { 15079 char_size = 2; 15080 if (sizeof(wchar_t) == 2) 15081 share_wstr = 1; 15082 } 15083 else { 15084 assert(kind == PyUnicode_4BYTE_KIND); 15085 char_size = 4; 15086 if (sizeof(wchar_t) == 4) 15087 share_wstr = 1; 15088 } 15089 15090 /* Ensure we won't overflow the length. 
*/ 15091 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 15092 PyErr_NoMemory(); 15093 goto onError; 15094 } 15095 data = PyObject_MALLOC((length + 1) * char_size); 15096 if (data == NULL) { 15097 PyErr_NoMemory(); 15098 goto onError; 15099 } 15100 15101 _PyUnicode_DATA_ANY(self) = data; 15102 if (share_utf8) { 15103 _PyUnicode_UTF8_LENGTH(self) = length; 15104 _PyUnicode_UTF8(self) = data; 15105 } 15106 if (share_wstr) { 15107 _PyUnicode_WSTR_LENGTH(self) = length; 15108 _PyUnicode_WSTR(self) = (wchar_t *)data; 15109 } 15110 15111 memcpy(data, PyUnicode_DATA(unicode), 15112 kind * (length + 1)); 15113 assert(_PyUnicode_CheckConsistency(self, 1)); 15114 #ifdef Py_DEBUG 15115 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 15116 #endif 15117 Py_DECREF(unicode); 15118 return self; 15119 15120 onError: 15121 Py_DECREF(unicode); 15122 Py_DECREF(self); 15123 return NULL; 15124 } 15125 15126 PyDoc_STRVAR(unicode_doc, 15127 "str(object='') -> str\n\ 15128 str(bytes_or_buffer[, encoding[, errors]]) -> str\n\ 15129 \n\ 15130 Create a new string object from the given object. 
If encoding or\n\ 15131 errors is specified, then the object must expose a data buffer\n\ 15132 that will be decoded using the given encoding and error handler.\n\ 15133 Otherwise, returns the result of object.__str__() (if defined)\n\ 15134 or repr(object).\n\ 15135 encoding defaults to sys.getdefaultencoding().\n\ 15136 errors defaults to 'strict'."); 15137 15138 static PyObject *unicode_iter(PyObject *seq); 15139 15140 PyTypeObject PyUnicode_Type = { 15141 PyVarObject_HEAD_INIT(&PyType_Type, 0) 15142 "str", /* tp_name */ 15143 sizeof(PyUnicodeObject), /* tp_size */ 15144 0, /* tp_itemsize */ 15145 /* Slots */ 15146 (destructor)unicode_dealloc, /* tp_dealloc */ 15147 0, /* tp_print */ 15148 0, /* tp_getattr */ 15149 0, /* tp_setattr */ 15150 0, /* tp_reserved */ 15151 unicode_repr, /* tp_repr */ 15152 &unicode_as_number, /* tp_as_number */ 15153 &unicode_as_sequence, /* tp_as_sequence */ 15154 &unicode_as_mapping, /* tp_as_mapping */ 15155 (hashfunc) unicode_hash, /* tp_hash*/ 15156 0, /* tp_call*/ 15157 (reprfunc) unicode_str, /* tp_str */ 15158 PyObject_GenericGetAttr, /* tp_getattro */ 15159 0, /* tp_setattro */ 15160 0, /* tp_as_buffer */ 15161 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 15162 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 15163 unicode_doc, /* tp_doc */ 15164 0, /* tp_traverse */ 15165 0, /* tp_clear */ 15166 PyUnicode_RichCompare, /* tp_richcompare */ 15167 0, /* tp_weaklistoffset */ 15168 unicode_iter, /* tp_iter */ 15169 0, /* tp_iternext */ 15170 unicode_methods, /* tp_methods */ 15171 0, /* tp_members */ 15172 0, /* tp_getset */ 15173 &PyBaseObject_Type, /* tp_base */ 15174 0, /* tp_dict */ 15175 0, /* tp_descr_get */ 15176 0, /* tp_descr_set */ 15177 0, /* tp_dictoffset */ 15178 0, /* tp_init */ 15179 0, /* tp_alloc */ 15180 unicode_new, /* tp_new */ 15181 PyObject_Del, /* tp_free */ 15182 }; 15183 15184 /* Initialize the Unicode implementation */ 15185 15186 int _PyUnicode_Init(void) 15187 { 15188 /* XXX - move this array to 
unicodectype.c ? */ 15189 Py_UCS2 linebreak[] = { 15190 0x000A, /* LINE FEED */ 15191 0x000D, /* CARRIAGE RETURN */ 15192 0x001C, /* FILE SEPARATOR */ 15193 0x001D, /* GROUP SEPARATOR */ 15194 0x001E, /* RECORD SEPARATOR */ 15195 0x0085, /* NEXT LINE */ 15196 0x2028, /* LINE SEPARATOR */ 15197 0x2029, /* PARAGRAPH SEPARATOR */ 15198 }; 15199 15200 /* Init the implementation */ 15201 _Py_INCREF_UNICODE_EMPTY(); 15202 if (!unicode_empty) 15203 Py_FatalError("Can't create empty string"); 15204 Py_DECREF(unicode_empty); 15205 15206 if (PyType_Ready(&PyUnicode_Type) < 0) 15207 Py_FatalError("Can't initialize 'unicode'"); 15208 15209 /* initialize the linebreak bloom filter */ 15210 bloom_linebreak = make_bloom_mask( 15211 PyUnicode_2BYTE_KIND, linebreak, 15212 Py_ARRAY_LENGTH(linebreak)); 15213 15214 if (PyType_Ready(&EncodingMapType) < 0) 15215 Py_FatalError("Can't initialize encoding map type"); 15216 15217 if (PyType_Ready(&PyFieldNameIter_Type) < 0) 15218 Py_FatalError("Can't initialize field name iterator type"); 15219 15220 if (PyType_Ready(&PyFormatterIter_Type) < 0) 15221 Py_FatalError("Can't initialize formatter iter type"); 15222 15223 return 0; 15224 } 15225 15226 /* Finalize the Unicode implementation */ 15227 15228 int 15229 PyUnicode_ClearFreeList(void) 15230 { 15231 return 0; 15232 } 15233 15234 void 15235 _PyUnicode_Fini(void) 15236 { 15237 int i; 15238 15239 Py_CLEAR(unicode_empty); 15240 15241 for (i = 0; i < 256; i++) 15242 Py_CLEAR(unicode_latin1[i]); 15243 _PyUnicode_ClearStaticStrings(); 15244 (void)PyUnicode_ClearFreeList(); 15245 } 15246 15247 void 15248 PyUnicode_InternInPlace(PyObject **p) 15249 { 15250 PyObject *s = *p; 15251 PyObject *t; 15252 #ifdef Py_DEBUG 15253 assert(s != NULL); 15254 assert(_PyUnicode_CHECK(s)); 15255 #else 15256 if (s == NULL || !PyUnicode_Check(s)) 15257 return; 15258 #endif 15259 /* If it's a subclass, we don't really know what putting 15260 it in the interned dict might do. 
*/ 15261 if (!PyUnicode_CheckExact(s)) 15262 return; 15263 if (PyUnicode_CHECK_INTERNED(s)) 15264 return; 15265 if (interned == NULL) { 15266 interned = PyDict_New(); 15267 if (interned == NULL) { 15268 PyErr_Clear(); /* Don't leave an exception */ 15269 return; 15270 } 15271 } 15272 Py_ALLOW_RECURSION 15273 t = PyDict_SetDefault(interned, s, s); 15274 Py_END_ALLOW_RECURSION 15275 if (t == NULL) { 15276 PyErr_Clear(); 15277 return; 15278 } 15279 if (t != s) { 15280 Py_INCREF(t); 15281 Py_SETREF(*p, t); 15282 return; 15283 } 15284 /* The two references in interned are not counted by refcnt. 15285 The deallocator will take care of this */ 15286 Py_REFCNT(s) -= 2; 15287 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 15288 } 15289 15290 void 15291 PyUnicode_InternImmortal(PyObject **p) 15292 { 15293 PyUnicode_InternInPlace(p); 15294 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 15295 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 15296 Py_INCREF(*p); 15297 } 15298 } 15299 15300 PyObject * 15301 PyUnicode_InternFromString(const char *cp) 15302 { 15303 PyObject *s = PyUnicode_FromString(cp); 15304 if (s == NULL) 15305 return NULL; 15306 PyUnicode_InternInPlace(&s); 15307 return s; 15308 } 15309 15310 void 15311 _Py_ReleaseInternedUnicodeStrings(void) 15312 { 15313 PyObject *keys; 15314 PyObject *s; 15315 Py_ssize_t i, n; 15316 Py_ssize_t immortal_size = 0, mortal_size = 0; 15317 15318 if (interned == NULL || !PyDict_Check(interned)) 15319 return; 15320 keys = PyDict_Keys(interned); 15321 if (keys == NULL || !PyList_Check(keys)) { 15322 PyErr_Clear(); 15323 return; 15324 } 15325 15326 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 15327 detector, interned unicode strings are not forcibly deallocated; 15328 rather, we give them their stolen references back, and then clear 15329 and DECREF the interned dict. 
*/ 15330 15331 n = PyList_GET_SIZE(keys); 15332 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 15333 n); 15334 for (i = 0; i < n; i++) { 15335 s = PyList_GET_ITEM(keys, i); 15336 if (PyUnicode_READY(s) == -1) { 15337 Py_UNREACHABLE(); 15338 } 15339 switch (PyUnicode_CHECK_INTERNED(s)) { 15340 case SSTATE_NOT_INTERNED: 15341 /* XXX Shouldn't happen */ 15342 break; 15343 case SSTATE_INTERNED_IMMORTAL: 15344 Py_REFCNT(s) += 1; 15345 immortal_size += PyUnicode_GET_LENGTH(s); 15346 break; 15347 case SSTATE_INTERNED_MORTAL: 15348 Py_REFCNT(s) += 2; 15349 mortal_size += PyUnicode_GET_LENGTH(s); 15350 break; 15351 default: 15352 Py_FatalError("Inconsistent interned string state."); 15353 } 15354 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 15355 } 15356 fprintf(stderr, "total size of all interned strings: " 15357 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 15358 "mortal/immortal\n", mortal_size, immortal_size); 15359 Py_DECREF(keys); 15360 PyDict_Clear(interned); 15361 Py_CLEAR(interned); 15362 } 15363 15364 15365 /********************* Unicode Iterator **************************/ 15366 15367 typedef struct { 15368 PyObject_HEAD 15369 Py_ssize_t it_index; 15370 PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 15371 } unicodeiterobject; 15372 15373 static void 15374 unicodeiter_dealloc(unicodeiterobject *it) 15375 { 15376 _PyObject_GC_UNTRACK(it); 15377 Py_XDECREF(it->it_seq); 15378 PyObject_GC_Del(it); 15379 } 15380 15381 static int 15382 unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 15383 { 15384 Py_VISIT(it->it_seq); 15385 return 0; 15386 } 15387 15388 static PyObject * 15389 unicodeiter_next(unicodeiterobject *it) 15390 { 15391 PyObject *seq, *item; 15392 15393 assert(it != NULL); 15394 seq = it->it_seq; 15395 if (seq == NULL) 15396 return NULL; 15397 assert(_PyUnicode_CHECK(seq)); 15398 15399 if (it->it_index < PyUnicode_GET_LENGTH(seq)) { 15400 int kind = PyUnicode_KIND(seq); 15401 void *data 
= PyUnicode_DATA(seq); 15402 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 15403 item = PyUnicode_FromOrdinal(chr); 15404 if (item != NULL) 15405 ++it->it_index; 15406 return item; 15407 } 15408 15409 it->it_seq = NULL; 15410 Py_DECREF(seq); 15411 return NULL; 15412 } 15413 15414 static PyObject * 15415 unicodeiter_len(unicodeiterobject *it) 15416 { 15417 Py_ssize_t len = 0; 15418 if (it->it_seq) 15419 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 15420 return PyLong_FromSsize_t(len); 15421 } 15422 15423 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 15424 15425 static PyObject * 15426 unicodeiter_reduce(unicodeiterobject *it) 15427 { 15428 if (it->it_seq != NULL) { 15429 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"), 15430 it->it_seq, it->it_index); 15431 } else { 15432 PyObject *u = (PyObject *)_PyUnicode_New(0); 15433 if (u == NULL) 15434 return NULL; 15435 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u); 15436 } 15437 } 15438 15439 PyDoc_STRVAR(reduce_doc, "Return state information for pickling."); 15440 15441 static PyObject * 15442 unicodeiter_setstate(unicodeiterobject *it, PyObject *state) 15443 { 15444 Py_ssize_t index = PyLong_AsSsize_t(state); 15445 if (index == -1 && PyErr_Occurred()) 15446 return NULL; 15447 if (it->it_seq != NULL) { 15448 if (index < 0) 15449 index = 0; 15450 else if (index > PyUnicode_GET_LENGTH(it->it_seq)) 15451 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */ 15452 it->it_index = index; 15453 } 15454 Py_RETURN_NONE; 15455 } 15456 15457 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling."); 15458 15459 static PyMethodDef unicodeiter_methods[] = { 15460 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 15461 length_hint_doc}, 15462 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS, 15463 reduce_doc}, 15464 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O, 15465 setstate_doc}, 15466 
{NULL, NULL} /* sentinel */ 15467 }; 15468 15469 PyTypeObject PyUnicodeIter_Type = { 15470 PyVarObject_HEAD_INIT(&PyType_Type, 0) 15471 "str_iterator", /* tp_name */ 15472 sizeof(unicodeiterobject), /* tp_basicsize */ 15473 0, /* tp_itemsize */ 15474 /* methods */ 15475 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 15476 0, /* tp_print */ 15477 0, /* tp_getattr */ 15478 0, /* tp_setattr */ 15479 0, /* tp_reserved */ 15480 0, /* tp_repr */ 15481 0, /* tp_as_number */ 15482 0, /* tp_as_sequence */ 15483 0, /* tp_as_mapping */ 15484 0, /* tp_hash */ 15485 0, /* tp_call */ 15486 0, /* tp_str */ 15487 PyObject_GenericGetAttr, /* tp_getattro */ 15488 0, /* tp_setattro */ 15489 0, /* tp_as_buffer */ 15490 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 15491 0, /* tp_doc */ 15492 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 15493 0, /* tp_clear */ 15494 0, /* tp_richcompare */ 15495 0, /* tp_weaklistoffset */ 15496 PyObject_SelfIter, /* tp_iter */ 15497 (iternextfunc)unicodeiter_next, /* tp_iternext */ 15498 unicodeiter_methods, /* tp_methods */ 15499 0, 15500 }; 15501 15502 static PyObject * 15503 unicode_iter(PyObject *seq) 15504 { 15505 unicodeiterobject *it; 15506 15507 if (!PyUnicode_Check(seq)) { 15508 PyErr_BadInternalCall(); 15509 return NULL; 15510 } 15511 if (PyUnicode_READY(seq) == -1) 15512 return NULL; 15513 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 15514 if (it == NULL) 15515 return NULL; 15516 it->it_index = 0; 15517 Py_INCREF(seq); 15518 it->it_seq = seq; 15519 _PyObject_GC_TRACK(it); 15520 return (PyObject *)it; 15521 } 15522 15523 15524 size_t 15525 Py_UNICODE_strlen(const Py_UNICODE *u) 15526 { 15527 return wcslen(u); 15528 } 15529 15530 Py_UNICODE* 15531 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 15532 { 15533 Py_UNICODE *u = s1; 15534 while ((*u++ = *s2++)); 15535 return s1; 15536 } 15537 15538 Py_UNICODE* 15539 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15540 { 15541 
Py_UNICODE *u = s1; 15542 while ((*u++ = *s2++)) 15543 if (n-- == 0) 15544 break; 15545 return s1; 15546 } 15547 15548 Py_UNICODE* 15549 Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 15550 { 15551 Py_UNICODE *u1 = s1; 15552 u1 += wcslen(u1); 15553 while ((*u1++ = *s2++)); 15554 return s1; 15555 } 15556 15557 int 15558 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 15559 { 15560 while (*s1 && *s2 && *s1 == *s2) 15561 s1++, s2++; 15562 if (*s1 && *s2) 15563 return (*s1 < *s2) ? -1 : +1; 15564 if (*s1) 15565 return 1; 15566 if (*s2) 15567 return -1; 15568 return 0; 15569 } 15570 15571 int 15572 Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15573 { 15574 Py_UNICODE u1, u2; 15575 for (; n != 0; n--) { 15576 u1 = *s1; 15577 u2 = *s2; 15578 if (u1 != u2) 15579 return (u1 < u2) ? -1 : +1; 15580 if (u1 == '\0') 15581 return 0; 15582 s1++; 15583 s2++; 15584 } 15585 return 0; 15586 } 15587 15588 Py_UNICODE* 15589 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 15590 { 15591 const Py_UNICODE *p; 15592 for (p = s; *p; p++) 15593 if (*p == c) 15594 return (Py_UNICODE*)p; 15595 return NULL; 15596 } 15597 15598 Py_UNICODE* 15599 Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 15600 { 15601 const Py_UNICODE *p; 15602 p = s + wcslen(s); 15603 while (p != s) { 15604 p--; 15605 if (*p == c) 15606 return (Py_UNICODE*)p; 15607 } 15608 return NULL; 15609 } 15610 15611 Py_UNICODE* 15612 PyUnicode_AsUnicodeCopy(PyObject *unicode) 15613 { 15614 Py_UNICODE *u, *copy; 15615 Py_ssize_t len, size; 15616 15617 if (!PyUnicode_Check(unicode)) { 15618 PyErr_BadArgument(); 15619 return NULL; 15620 } 15621 u = PyUnicode_AsUnicodeAndSize(unicode, &len); 15622 if (u == NULL) 15623 return NULL; 15624 /* Ensure we won't overflow the size. 
*/ 15625 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { 15626 PyErr_NoMemory(); 15627 return NULL; 15628 } 15629 size = len + 1; /* copy the null character */ 15630 size *= sizeof(Py_UNICODE); 15631 copy = PyMem_Malloc(size); 15632 if (copy == NULL) { 15633 PyErr_NoMemory(); 15634 return NULL; 15635 } 15636 memcpy(copy, u, size); 15637 return copy; 15638 } 15639 15640 /* A _string module, to export formatter_parser and formatter_field_name_split 15641 to the string.Formatter class implemented in Python. */ 15642 15643 static PyMethodDef _string_methods[] = { 15644 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 15645 METH_O, PyDoc_STR("split the argument as a field name")}, 15646 {"formatter_parser", (PyCFunction) formatter_parser, 15647 METH_O, PyDoc_STR("parse the argument as a format string")}, 15648 {NULL, NULL} 15649 }; 15650 15651 static struct PyModuleDef _string_module = { 15652 PyModuleDef_HEAD_INIT, 15653 "_string", 15654 PyDoc_STR("string helper module"), 15655 0, 15656 _string_methods, 15657 NULL, 15658 NULL, 15659 NULL, 15660 NULL 15661 }; 15662 15663 PyMODINIT_FUNC 15664 PyInit__string(void) 15665 { 15666 return PyModule_Create(&_string_module); 15667 } 15668 15669 15670 #ifdef __cplusplus 15671 } 15672 #endif 15673