1 /* ------------------------------------------------------------------------ 2 3 Python Codec Registry and support functions 4 5 Written by Marc-Andre Lemburg (mal (at) lemburg.com). 6 7 Copyright (c) Corporation for National Research Initiatives. 8 9 ------------------------------------------------------------------------ */ 10 11 #include "Python.h" 12 #include <ctype.h> 13 14 /* --- Codec Registry ----------------------------------------------------- */ 15 16 /* Import the standard encodings package which will register the first 17 codec search function. 18 19 This is done in a lazy way so that the Unicode implementation does 20 not downgrade startup time of scripts not needing it. 21 22 ImportErrors are silently ignored by this function. Only one try is 23 made. 24 25 */ 26 27 static int _PyCodecRegistry_Init(void); /* Forward */ 28 29 int PyCodec_Register(PyObject *search_function) 30 { 31 PyInterpreterState *interp = PyThreadState_GET()->interp; 32 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) 33 goto onError; 34 if (search_function == NULL) { 35 PyErr_BadArgument(); 36 goto onError; 37 } 38 if (!PyCallable_Check(search_function)) { 39 PyErr_SetString(PyExc_TypeError, "argument must be callable"); 40 goto onError; 41 } 42 return PyList_Append(interp->codec_search_path, search_function); 43 44 onError: 45 return -1; 46 } 47 48 /* Convert a string to a normalized Python string: all characters are 49 converted to lower case, spaces are replaced with underscores. */ 50 51 static 52 PyObject *normalizestring(const char *string) 53 { 54 register size_t i; 55 size_t len = strlen(string); 56 char *p; 57 PyObject *v; 58 59 if (len > PY_SSIZE_T_MAX) { 60 PyErr_SetString(PyExc_OverflowError, "string is too large"); 61 return NULL; 62 } 63 64 v = PyString_FromStringAndSize(NULL, len); 65 if (v == NULL) 66 return NULL; 67 p = PyString_AS_STRING(v); 68 for (i = 0; i < len; i++) { 69 register char ch = string[i]; 70 if (ch == ' ') 71 ch = '-'; 72 else 73 ch = Py_TOLOWER(Py_CHARMASK(ch)); 74 p[i] = ch; 75 } 76 return v; 77 } 78 79 /* Lookup the given encoding and return a tuple providing the codec 80 facilities. 81 82 The encoding string is looked up converted to all lower-case 83 characters. This makes encodings looked up through this mechanism 84 effectively case-insensitive. 85 86 If no codec is found, a LookupError is set and NULL returned. 87 88 As side effect, this tries to load the encodings package, if not 89 yet done. This is part of the lazy load strategy for the encodings 90 package. 91 92 */ 93 94 PyObject *_PyCodec_Lookup(const char *encoding) 95 { 96 PyInterpreterState *interp; 97 PyObject *result, *args = NULL, *v; 98 Py_ssize_t i, len; 99 100 if (encoding == NULL) { 101 PyErr_BadArgument(); 102 goto onError; 103 } 104 105 interp = PyThreadState_GET()->interp; 106 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) 107 goto onError; 108 109 /* Convert the encoding to a normalized Python string: all 110 characters are converted to lower case, spaces and hyphens are 111 replaced with underscores. */ 112 v = normalizestring(encoding); 113 if (v == NULL) 114 goto onError; 115 PyString_InternInPlace(&v); 116 117 /* First, try to lookup the name in the registry dictionary */ 118 result = PyDict_GetItem(interp->codec_search_cache, v); 119 if (result != NULL) { 120 Py_INCREF(result); 121 Py_DECREF(v); 122 return result; 123 } 124 125 /* Next, scan the search functions in order of registration */ 126 args = PyTuple_New(1); 127 if (args == NULL) 128 goto onError; 129 PyTuple_SET_ITEM(args,0,v); 130 131 len = PyList_Size(interp->codec_search_path); 132 if (len < 0) 133 goto onError; 134 if (len == 0) { 135 PyErr_SetString(PyExc_LookupError, 136 "no codec search functions registered: " 137 "can't find encoding"); 138 goto onError; 139 } 140 141 for (i = 0; i < len; i++) { 142 PyObject *func; 143 144 func = PyList_GetItem(interp->codec_search_path, i); 145 if (func == NULL) 146 goto onError; 147 result = PyEval_CallObject(func, args); 148 if (result == NULL) 149 goto onError; 150 if (result == Py_None) { 151 Py_DECREF(result); 152 continue; 153 } 154 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) { 155 PyErr_SetString(PyExc_TypeError, 156 "codec search functions must return 4-tuples"); 157 Py_DECREF(result); 158 goto onError; 159 } 160 break; 161 } 162 if (i == len) { 163 /* XXX Perhaps we should cache misses too ? */ 164 PyErr_Format(PyExc_LookupError, 165 "unknown encoding: %s", encoding); 166 goto onError; 167 } 168 169 /* Cache and return the result */ 170 PyDict_SetItem(interp->codec_search_cache, v, result); 171 Py_DECREF(args); 172 return result; 173 174 onError: 175 Py_XDECREF(args); 176 return NULL; 177 } 178 179 static 180 PyObject *args_tuple(PyObject *object, 181 const char *errors) 182 { 183 PyObject *args; 184 185 args = PyTuple_New(1 + (errors != NULL)); 186 if (args == NULL) 187 return NULL; 188 Py_INCREF(object); 189 PyTuple_SET_ITEM(args,0,object); 190 if (errors) { 191 PyObject *v; 192 193 v = PyString_FromString(errors); 194 if (v == NULL) { 195 Py_DECREF(args); 196 return NULL; 197 } 198 PyTuple_SET_ITEM(args, 1, v); 199 } 200 return args; 201 } 202 203 /* Helper function to get a codec item */ 204 205 static 206 PyObject *codec_getitem(const char *encoding, int index) 207 { 208 PyObject *codecs; 209 PyObject *v; 210 211 codecs = _PyCodec_Lookup(encoding); 212 if (codecs == NULL) 213 return NULL; 214 v = PyTuple_GET_ITEM(codecs, index); 215 Py_DECREF(codecs); 216 Py_INCREF(v); 217 return v; 218 } 219 220 /* Helper functions to create an incremental codec. */ 221 static 222 PyObject *codec_makeincrementalcodec(PyObject *codec_info, 223 const char *errors, 224 const char *attrname) 225 { 226 PyObject *ret, *inccodec; 227 228 inccodec = PyObject_GetAttrString(codec_info, attrname); 229 if (inccodec == NULL) 230 return NULL; 231 if (errors) 232 ret = PyObject_CallFunction(inccodec, "s", errors); 233 else 234 ret = PyObject_CallFunction(inccodec, NULL); 235 Py_DECREF(inccodec); 236 return ret; 237 } 238 239 static 240 PyObject *codec_getincrementalcodec(const char *encoding, 241 const char *errors, 242 const char *attrname) 243 { 244 PyObject *codec_info, *ret; 245 246 codec_info = _PyCodec_Lookup(encoding); 247 if (codec_info == NULL) 248 return NULL; 249 ret = codec_makeincrementalcodec(codec_info, errors, attrname); 250 Py_DECREF(codec_info); 251 return ret; 252 } 253 254 /* Helper function to create a stream codec. */ 255 256 static 257 PyObject *codec_getstreamcodec(const char *encoding, 258 PyObject *stream, 259 const char *errors, 260 const int index) 261 { 262 PyObject *codecs, *streamcodec, *codeccls; 263 264 codecs = _PyCodec_Lookup(encoding); 265 if (codecs == NULL) 266 return NULL; 267 268 codeccls = PyTuple_GET_ITEM(codecs, index); 269 if (errors != NULL) 270 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors); 271 else 272 streamcodec = PyObject_CallFunction(codeccls, "O", stream); 273 Py_DECREF(codecs); 274 return streamcodec; 275 } 276 277 /* Helpers to work with the result of _PyCodec_Lookup 278 279 */ 280 PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, 281 const char *errors) 282 { 283 return codec_makeincrementalcodec(codec_info, errors, 284 "incrementaldecoder"); 285 } 286 287 PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, 288 const char *errors) 289 { 290 return codec_makeincrementalcodec(codec_info, errors, 291 "incrementalencoder"); 292 } 293 294 295 /* Convenience APIs to query the Codec registry. 296 297 All APIs return a codec object with incremented refcount. 298 299 */ 300 301 PyObject *PyCodec_Encoder(const char *encoding) 302 { 303 return codec_getitem(encoding, 0); 304 } 305 306 PyObject *PyCodec_Decoder(const char *encoding) 307 { 308 return codec_getitem(encoding, 1); 309 } 310 311 PyObject *PyCodec_IncrementalEncoder(const char *encoding, 312 const char *errors) 313 { 314 return codec_getincrementalcodec(encoding, errors, "incrementalencoder"); 315 } 316 317 PyObject *PyCodec_IncrementalDecoder(const char *encoding, 318 const char *errors) 319 { 320 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder"); 321 } 322 323 PyObject *PyCodec_StreamReader(const char *encoding, 324 PyObject *stream, 325 const char *errors) 326 { 327 return codec_getstreamcodec(encoding, stream, errors, 2); 328 } 329 330 PyObject *PyCodec_StreamWriter(const char *encoding, 331 PyObject *stream, 332 const char *errors) 333 { 334 return codec_getstreamcodec(encoding, stream, errors, 3); 335 } 336 337 /* Encode an object (e.g. a Unicode object) using the given encoding 338 and return the resulting encoded object (usually a Python string). 339 340 errors is passed to the encoder factory as argument if non-NULL. */ 341 342 static PyObject * 343 _PyCodec_EncodeInternal(PyObject *object, 344 PyObject *encoder, 345 const char *encoding, 346 const char *errors) 347 { 348 PyObject *args = NULL, *result = NULL; 349 PyObject *v; 350 351 args = args_tuple(object, errors); 352 if (args == NULL) 353 goto onError; 354 355 result = PyEval_CallObject(encoder,args); 356 if (result == NULL) 357 goto onError; 358 359 if (!PyTuple_Check(result) || 360 PyTuple_GET_SIZE(result) != 2) { 361 PyErr_SetString(PyExc_TypeError, 362 "encoder must return a tuple (object,integer)"); 363 goto onError; 364 } 365 v = PyTuple_GET_ITEM(result,0); 366 Py_INCREF(v); 367 /* We don't check or use the second (integer) entry. */ 368 369 Py_DECREF(args); 370 Py_DECREF(encoder); 371 Py_DECREF(result); 372 return v; 373 374 onError: 375 Py_XDECREF(result); 376 Py_XDECREF(args); 377 Py_XDECREF(encoder); 378 return NULL; 379 } 380 381 /* Decode an object (usually a Python string) using the given encoding 382 and return an equivalent object (e.g. a Unicode object). 383 384 errors is passed to the decoder factory as argument if non-NULL. */ 385 386 static PyObject * 387 _PyCodec_DecodeInternal(PyObject *object, 388 PyObject *decoder, 389 const char *encoding, 390 const char *errors) 391 { 392 PyObject *args = NULL, *result = NULL; 393 PyObject *v; 394 395 args = args_tuple(object, errors); 396 if (args == NULL) 397 goto onError; 398 399 result = PyEval_CallObject(decoder,args); 400 if (result == NULL) 401 goto onError; 402 if (!PyTuple_Check(result) || 403 PyTuple_GET_SIZE(result) != 2) { 404 PyErr_SetString(PyExc_TypeError, 405 "decoder must return a tuple (object,integer)"); 406 goto onError; 407 } 408 v = PyTuple_GET_ITEM(result,0); 409 Py_INCREF(v); 410 /* We don't check or use the second (integer) entry. */ 411 412 Py_DECREF(args); 413 Py_DECREF(decoder); 414 Py_DECREF(result); 415 return v; 416 417 onError: 418 Py_XDECREF(args); 419 Py_XDECREF(decoder); 420 Py_XDECREF(result); 421 return NULL; 422 } 423 424 /* Generic encoding/decoding API */ 425 PyObject *PyCodec_Encode(PyObject *object, 426 const char *encoding, 427 const char *errors) 428 { 429 PyObject *encoder; 430 431 encoder = PyCodec_Encoder(encoding); 432 if (encoder == NULL) 433 return NULL; 434 435 return _PyCodec_EncodeInternal(object, encoder, encoding, errors); 436 } 437 438 PyObject *PyCodec_Decode(PyObject *object, 439 const char *encoding, 440 const char *errors) 441 { 442 PyObject *decoder; 443 444 decoder = PyCodec_Decoder(encoding); 445 if (decoder == NULL) 446 return NULL; 447 448 return _PyCodec_DecodeInternal(object, decoder, encoding, errors); 449 } 450 451 /* Text encoding/decoding API */ 452 PyObject * _PyCodec_LookupTextEncoding(const char *encoding, 453 const char *alternate_command) 454 { 455 PyObject *codec; 456 PyObject *attr; 457 int is_text_codec; 458 459 codec = _PyCodec_Lookup(encoding); 460 if (codec == NULL) 461 return NULL; 462 463 /* Backwards compatibility: assume any raw tuple describes a text 464 * encoding, and the same for anything lacking the private 465 * attribute. 466 */ 467 if (Py_Py3kWarningFlag && !PyTuple_CheckExact(codec)) { 468 attr = PyObject_GetAttrString(codec, "_is_text_encoding"); 469 if (attr == NULL) { 470 if (!PyErr_ExceptionMatches(PyExc_AttributeError)) 471 goto onError; 472 PyErr_Clear(); 473 } else { 474 is_text_codec = PyObject_IsTrue(attr); 475 Py_DECREF(attr); 476 if (is_text_codec < 0) 477 goto onError; 478 if (!is_text_codec) { 479 PyObject *msg = PyString_FromFormat( 480 "'%.400s' is not a text encoding; " 481 "use %s to handle arbitrary codecs", 482 encoding, alternate_command); 483 if (msg == NULL) 484 goto onError; 485 if (PyErr_WarnPy3k(PyString_AS_STRING(msg), 1) < 0) { 486 Py_DECREF(msg); 487 goto onError; 488 } 489 Py_DECREF(msg); 490 } 491 } 492 } 493 494 /* This appears to be a valid text encoding */ 495 return codec; 496 497 onError: 498 Py_DECREF(codec); 499 return NULL; 500 } 501 502 503 static 504 PyObject *codec_getitem_checked(const char *encoding, 505 const char *alternate_command, 506 int index) 507 { 508 PyObject *codec; 509 PyObject *v; 510 511 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command); 512 if (codec == NULL) 513 return NULL; 514 515 v = PyTuple_GET_ITEM(codec, index); 516 Py_INCREF(v); 517 Py_DECREF(codec); 518 return v; 519 } 520 521 static PyObject * _PyCodec_TextEncoder(const char *encoding) 522 { 523 return codec_getitem_checked(encoding, "codecs.encode()", 0); 524 } 525 526 static PyObject * _PyCodec_TextDecoder(const char *encoding) 527 { 528 return codec_getitem_checked(encoding, "codecs.decode()", 1); 529 } 530 531 PyObject *_PyCodec_EncodeText(PyObject *object, 532 const char *encoding, 533 const char *errors) 534 { 535 PyObject *encoder; 536 537 encoder = _PyCodec_TextEncoder(encoding); 538 if (encoder == NULL) 539 return NULL; 540 541 return _PyCodec_EncodeInternal(object, encoder, encoding, errors); 542 } 543 544 PyObject *_PyCodec_DecodeText(PyObject *object, 545 const char *encoding, 546 const char *errors) 547 { 548 PyObject *decoder; 549 550 decoder = _PyCodec_TextDecoder(encoding); 551 if (decoder == NULL) 552 return NULL; 553 554 return _PyCodec_DecodeInternal(object, decoder, encoding, errors); 555 } 556 557 /* Register the error handling callback function error under the name 558 name. This function will be called by the codec when it encounters 559 an unencodable characters/undecodable bytes and doesn't know the 560 callback name, when name is specified as the error parameter 561 in the call to the encode/decode function. 562 Return 0 on success, -1 on error */ 563 int PyCodec_RegisterError(const char *name, PyObject *error) 564 { 565 PyInterpreterState *interp = PyThreadState_GET()->interp; 566 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) 567 return -1; 568 if (!PyCallable_Check(error)) { 569 PyErr_SetString(PyExc_TypeError, "handler must be callable"); 570 return -1; 571 } 572 return PyDict_SetItemString(interp->codec_error_registry, 573 (char *)name, error); 574 } 575 576 /* Lookup the error handling callback function registered under the 577 name error. As a special case NULL can be passed, in which case 578 the error handling callback for strict encoding will be returned. */ 579 PyObject *PyCodec_LookupError(const char *name) 580 { 581 PyObject *handler = NULL; 582 583 PyInterpreterState *interp = PyThreadState_GET()->interp; 584 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) 585 return NULL; 586 587 if (name==NULL) 588 name = "strict"; 589 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name); 590 if (!handler) 591 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name); 592 else 593 Py_INCREF(handler); 594 return handler; 595 } 596 597 static void wrong_exception_type(PyObject *exc) 598 { 599 PyObject *type = PyObject_GetAttrString(exc, "__class__"); 600 if (type != NULL) { 601 PyObject *name = PyObject_GetAttrString(type, "__name__"); 602 Py_DECREF(type); 603 if (name != NULL) { 604 PyObject *string = PyObject_Str(name); 605 Py_DECREF(name); 606 if (string != NULL) { 607 PyErr_Format(PyExc_TypeError, 608 "don't know how to handle %.400s in error callback", 609 PyString_AS_STRING(string)); 610 Py_DECREF(string); 611 } 612 } 613 } 614 } 615 616 PyObject *PyCodec_StrictErrors(PyObject *exc) 617 { 618 if (PyExceptionInstance_Check(exc)) 619 PyErr_SetObject(PyExceptionInstance_Class(exc), exc); 620 else 621 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance"); 622 return NULL; 623 } 624 625 626 #ifdef Py_USING_UNICODE 627 PyObject *PyCodec_IgnoreErrors(PyObject *exc) 628 { 629 Py_ssize_t end; 630 631 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { 632 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 633 return NULL; 634 } 635 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { 636 if (PyUnicodeDecodeError_GetEnd(exc, &end)) 637 return NULL; 638 } 639 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { 640 if (PyUnicodeTranslateError_GetEnd(exc, &end)) 641 return NULL; 642 } 643 else { 644 wrong_exception_type(exc); 645 return NULL; 646 } 647 /* ouch: passing NULL, 0, pos gives None instead of u'' */ 648 return Py_BuildValue("(u#n)", &end, 0, end); 649 } 650 651 652 PyObject *PyCodec_ReplaceErrors(PyObject *exc) 653 { 654 PyObject *restuple; 655 Py_ssize_t start; 656 Py_ssize_t end; 657 Py_ssize_t i; 658 659 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { 660 PyObject *res; 661 Py_UNICODE *p; 662 if (PyUnicodeEncodeError_GetStart(exc, &start)) 663 return NULL; 664 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 665 return NULL; 666 res = PyUnicode_FromUnicode(NULL, end-start); 667 if (res == NULL) 668 return NULL; 669 for (p = PyUnicode_AS_UNICODE(res), i = start; 670 i<end; ++p, ++i) 671 *p = '?'; 672 restuple = Py_BuildValue("(On)", res, end); 673 Py_DECREF(res); 674 return restuple; 675 } 676 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { 677 Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER; 678 if (PyUnicodeDecodeError_GetEnd(exc, &end)) 679 return NULL; 680 return Py_BuildValue("(u#n)", &res, (Py_ssize_t)1, end); 681 } 682 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { 683 PyObject *res; 684 Py_UNICODE *p; 685 if (PyUnicodeTranslateError_GetStart(exc, &start)) 686 return NULL; 687 if (PyUnicodeTranslateError_GetEnd(exc, &end)) 688 return NULL; 689 res = PyUnicode_FromUnicode(NULL, end-start); 690 if (res == NULL) 691 return NULL; 692 for (p = PyUnicode_AS_UNICODE(res), i = start; 693 i<end; ++p, ++i) 694 *p = Py_UNICODE_REPLACEMENT_CHARACTER; 695 restuple = Py_BuildValue("(On)", res, end); 696 Py_DECREF(res); 697 return restuple; 698 } 699 else { 700 wrong_exception_type(exc); 701 return NULL; 702 } 703 } 704 705 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) 706 { 707 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { 708 PyObject *restuple; 709 PyObject *object; 710 Py_ssize_t start; 711 Py_ssize_t end; 712 PyObject *res; 713 Py_UNICODE *p; 714 Py_UNICODE *startp; 715 Py_UNICODE *e; 716 Py_UNICODE *outp; 717 Py_ssize_t ressize; 718 if (PyUnicodeEncodeError_GetStart(exc, &start)) 719 return NULL; 720 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 721 return NULL; 722 if (!(object = PyUnicodeEncodeError_GetObject(exc))) 723 return NULL; 724 startp = PyUnicode_AS_UNICODE(object); 725 if (end - start > PY_SSIZE_T_MAX / (2+7+1)) { 726 end = start + PY_SSIZE_T_MAX / (2+7+1); 727 #ifndef Py_UNICODE_WIDE 728 if (0xD800 <= startp[end - 1] && startp[end - 1] <= 0xDBFF) 729 end--; 730 #endif 731 } 732 e = startp + end; 733 for (p = startp+start, ressize = 0; p < e;) { 734 Py_UCS4 ch = *p++; 735 #ifndef Py_UNICODE_WIDE 736 if ((0xD800 <= ch && ch <= 0xDBFF) && 737 (p < e) && 738 (0xDC00 <= *p && *p <= 0xDFFF)) { 739 ch = ((((ch & 0x03FF) << 10) | 740 ((Py_UCS4)*p++ & 0x03FF)) + 0x10000); 741 } 742 #endif 743 if (ch < 10) 744 ressize += 2+1+1; 745 else if (ch < 100) 746 ressize += 2+2+1; 747 else if (ch < 1000) 748 ressize += 2+3+1; 749 else if (ch < 10000) 750 ressize += 2+4+1; 751 else if (ch < 100000) 752 ressize += 2+5+1; 753 else if (ch < 1000000) 754 ressize += 2+6+1; 755 else 756 ressize += 2+7+1; 757 } 758 /* allocate replacement */ 759 res = PyUnicode_FromUnicode(NULL, ressize); 760 if (res == NULL) { 761 Py_DECREF(object); 762 return NULL; 763 } 764 /* generate replacement */ 765 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); p < e;) { 766 int digits; 767 int base; 768 Py_UCS4 ch = *p++; 769 #ifndef Py_UNICODE_WIDE 770 if ((0xD800 <= ch && ch <= 0xDBFF) && 771 (p < startp+end) && 772 (0xDC00 <= *p && *p <= 0xDFFF)) { 773 ch = ((((ch & 0x03FF) << 10) | 774 ((Py_UCS4)*p++ & 0x03FF)) + 0x10000); 775 } 776 #endif 777 *outp++ = '&'; 778 *outp++ = '#'; 779 if (ch < 10) { 780 digits = 1; 781 base = 1; 782 } 783 else if (ch < 100) { 784 digits = 2; 785 base = 10; 786 } 787 else if (ch < 1000) { 788 digits = 3; 789 base = 100; 790 } 791 else if (ch < 10000) { 792 digits = 4; 793 base = 1000; 794 } 795 else if (ch < 100000) { 796 digits = 5; 797 base = 10000; 798 } 799 else if (ch < 1000000) { 800 digits = 6; 801 base = 100000; 802 } 803 else { 804 digits = 7; 805 base = 1000000; 806 } 807 while (digits-->0) { 808 *outp++ = '0' + ch/base; 809 ch %= base; 810 base /= 10; 811 } 812 *outp++ = ';'; 813 } 814 restuple = Py_BuildValue("(On)", res, end); 815 Py_DECREF(res); 816 Py_DECREF(object); 817 return restuple; 818 } 819 else { 820 wrong_exception_type(exc); 821 return NULL; 822 } 823 } 824 825 static Py_UNICODE hexdigits[] = { 826 '0', '1', '2', '3', '4', '5', '6', '7', 827 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' 828 }; 829 830 PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) 831 { 832 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { 833 PyObject *restuple; 834 PyObject *object; 835 Py_ssize_t start; 836 Py_ssize_t end; 837 PyObject *res; 838 Py_UNICODE *p; 839 Py_UNICODE *startp; 840 Py_UNICODE *outp; 841 Py_ssize_t ressize; 842 if (PyUnicodeEncodeError_GetStart(exc, &start)) 843 return NULL; 844 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 845 return NULL; 846 if (!(object = PyUnicodeEncodeError_GetObject(exc))) 847 return NULL; 848 if (end - start > PY_SSIZE_T_MAX / (1+1+8)) 849 end = start + PY_SSIZE_T_MAX / (1+1+8); 850 startp = PyUnicode_AS_UNICODE(object); 851 for (p = startp+start, ressize = 0; p < startp+end; ++p) { 852 #ifdef Py_UNICODE_WIDE 853 if (*p >= 0x00010000) 854 ressize += 1+1+8; 855 else 856 #endif 857 if (*p >= 0x100) { 858 ressize += 1+1+4; 859 } 860 else 861 ressize += 1+1+2; 862 } 863 res = PyUnicode_FromUnicode(NULL, ressize); 864 if (res == NULL) { 865 Py_DECREF(object); 866 return NULL; 867 } 868 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); 869 p < startp+end; ++p) { 870 Py_UNICODE c = *p; 871 *outp++ = '\\'; 872 #ifdef Py_UNICODE_WIDE 873 if (c >= 0x00010000) { 874 *outp++ = 'U'; 875 *outp++ = hexdigits[(c>>28)&0xf]; 876 *outp++ = hexdigits[(c>>24)&0xf]; 877 *outp++ = hexdigits[(c>>20)&0xf]; 878 *outp++ = hexdigits[(c>>16)&0xf]; 879 *outp++ = hexdigits[(c>>12)&0xf]; 880 *outp++ = hexdigits[(c>>8)&0xf]; 881 } 882 else 883 #endif 884 if (c >= 0x100) { 885 *outp++ = 'u'; 886 *outp++ = hexdigits[(c>>12)&0xf]; 887 *outp++ = hexdigits[(c>>8)&0xf]; 888 } 889 else 890 *outp++ = 'x'; 891 *outp++ = hexdigits[(c>>4)&0xf]; 892 *outp++ = hexdigits[c&0xf]; 893 } 894 895 restuple = Py_BuildValue("(On)", res, end); 896 Py_DECREF(res); 897 Py_DECREF(object); 898 return restuple; 899 } 900 else { 901 wrong_exception_type(exc); 902 return NULL; 903 } 904 } 905 #endif 906 907 static PyObject *strict_errors(PyObject *self, PyObject *exc) 908 { 909 return PyCodec_StrictErrors(exc); 910 } 911 912 913 #ifdef Py_USING_UNICODE 914 static PyObject *ignore_errors(PyObject *self, PyObject *exc) 915 { 916 return PyCodec_IgnoreErrors(exc); 917 } 918 919 920 static PyObject *replace_errors(PyObject *self, PyObject *exc) 921 { 922 return PyCodec_ReplaceErrors(exc); 923 } 924 925 926 static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc) 927 { 928 return PyCodec_XMLCharRefReplaceErrors(exc); 929 } 930 931 932 static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc) 933 { 934 return PyCodec_BackslashReplaceErrors(exc); 935 } 936 #endif 937 938 static int _PyCodecRegistry_Init(void) 939 { 940 static struct { 941 char *name; 942 PyMethodDef def; 943 } methods[] = 944 { 945 { 946 "strict", 947 { 948 "strict_errors", 949 strict_errors, 950 METH_O, 951 PyDoc_STR("Implements the 'strict' error handling, which " 952 "raises a UnicodeError on coding errors.") 953 } 954 }, 955 #ifdef Py_USING_UNICODE 956 { 957 "ignore", 958 { 959 "ignore_errors", 960 ignore_errors, 961 METH_O, 962 PyDoc_STR("Implements the 'ignore' error handling, which " 963 "ignores malformed data and continues.") 964 } 965 }, 966 { 967 "replace", 968 { 969 "replace_errors", 970 replace_errors, 971 METH_O, 972 PyDoc_STR("Implements the 'replace' error handling, which " 973 "replaces malformed data with a replacement marker.") 974 } 975 }, 976 { 977 "xmlcharrefreplace", 978 { 979 "xmlcharrefreplace_errors", 980 xmlcharrefreplace_errors, 981 METH_O, 982 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, " 983 "which replaces an unencodable character with the " 984 "appropriate XML character reference.") 985 } 986 }, 987 { 988 "backslashreplace", 989 { 990 "backslashreplace_errors", 991 backslashreplace_errors, 992 METH_O, 993 PyDoc_STR("Implements the 'backslashreplace' error handling, " 994 "which replaces an unencodable character with a " 995 "backslashed escape sequence.") 996 } 997 } 998 #endif 999 }; 1000 1001 PyInterpreterState *interp = PyThreadState_GET()->interp; 1002 PyObject *mod; 1003 unsigned i; 1004 1005 if (interp->codec_search_path != NULL) 1006 return 0; 1007 1008 interp->codec_search_path = PyList_New(0); 1009 interp->codec_search_cache = PyDict_New(); 1010 interp->codec_error_registry = PyDict_New(); 1011 1012 if (interp->codec_error_registry) { 1013 for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) { 1014 PyObject *func = PyCFunction_New(&methods[i].def, NULL); 1015 int res; 1016 if (!func) 1017 Py_FatalError("can't initialize codec error registry"); 1018 res = PyCodec_RegisterError(methods[i].name, func); 1019 Py_DECREF(func); 1020 if (res) 1021 Py_FatalError("can't initialize codec error registry"); 1022 } 1023 } 1024 1025 if (interp->codec_search_path == NULL || 1026 interp->codec_search_cache == NULL || 1027 interp->codec_error_registry == NULL) 1028 Py_FatalError("can't initialize codec registry"); 1029 1030 mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0); 1031 if (mod == NULL) { 1032 if (PyErr_ExceptionMatches(PyExc_ImportError)) { 1033 /* Ignore ImportErrors... this is done so that 1034 distributions can disable the encodings package. Note 1035 that other errors are not masked, e.g. SystemErrors 1036 raised to inform the user of an error in the Python 1037 configuration are still reported back to the user. */ 1038 PyErr_Clear(); 1039 return 0; 1040 } 1041 return -1; 1042 } 1043 Py_DECREF(mod); 1044 return 0; 1045 } 1046