1 """ codecs -- Python Codec Registry, API and helpers. 2 3 4 Written by Marc-Andre Lemburg (mal (at] lemburg.com). 5 6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 7 8 """#" 9 10 import builtins, sys 11 12 ### Registry and builtin stateless codec functions 13 14 try: 15 from _codecs import * 16 except ImportError as why: 17 raise SystemError('Failed to load the builtin codecs: %s' % why) 18 19 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE", 20 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", 21 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE", 22 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE", 23 "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder", 24 "StreamReader", "StreamWriter", 25 "StreamReaderWriter", "StreamRecoder", 26 "getencoder", "getdecoder", "getincrementalencoder", 27 "getincrementaldecoder", "getreader", "getwriter", 28 "encode", "decode", "iterencode", "iterdecode", 29 "strict_errors", "ignore_errors", "replace_errors", 30 "xmlcharrefreplace_errors", 31 "backslashreplace_errors", "namereplace_errors", 32 "register_error", "lookup_error"] 33 34 ### Constants 35 36 # 37 # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) 38 # and its possible byte string values 39 # for UTF8/UTF16/UTF32 output and little/big endian machines 40 # 41 42 # UTF-8 43 BOM_UTF8 = b'\xef\xbb\xbf' 44 45 # UTF-16, little endian 46 BOM_LE = BOM_UTF16_LE = b'\xff\xfe' 47 48 # UTF-16, big endian 49 BOM_BE = BOM_UTF16_BE = b'\xfe\xff' 50 51 # UTF-32, little endian 52 BOM_UTF32_LE = b'\xff\xfe\x00\x00' 53 54 # UTF-32, big endian 55 BOM_UTF32_BE = b'\x00\x00\xfe\xff' 56 57 if sys.byteorder == 'little': 58 59 # UTF-16, native endianness 60 BOM = BOM_UTF16 = BOM_UTF16_LE 61 62 # UTF-32, native endianness 63 BOM_UTF32 = BOM_UTF32_LE 64 65 else: 66 67 # UTF-16, native endianness 68 BOM = BOM_UTF16 = BOM_UTF16_BE 69 70 # UTF-32, native endianness 71 BOM_UTF32 = BOM_UTF32_BE 72 73 # Old broken names (don't use in new code) 74 BOM32_LE = BOM_UTF16_LE 75 BOM32_BE = BOM_UTF16_BE 76 BOM64_LE = BOM_UTF32_LE 77 BOM64_BE = BOM_UTF32_BE 78 79 80 ### Codec base classes (defining the API) 81 82 class CodecInfo(tuple): 83 """Codec details when looking up the codec registry""" 84 85 # Private API to allow Python 3.4 to blacklist the known non-Unicode 86 # codecs in the standard library. A more general mechanism to 87 # reliably distinguish test encodings from other codecs will hopefully 88 # be defined for Python 3.5 89 # 90 # See http://bugs.python.org/issue19619 91 _is_text_encoding = True # Assume codecs are text encodings by default 92 93 def __new__(cls, encode, decode, streamreader=None, streamwriter=None, 94 incrementalencoder=None, incrementaldecoder=None, name=None, 95 *, _is_text_encoding=None): 96 self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter)) 97 self.name = name 98 self.encode = encode 99 self.decode = decode 100 self.incrementalencoder = incrementalencoder 101 self.incrementaldecoder = incrementaldecoder 102 self.streamwriter = streamwriter 103 self.streamreader = streamreader 104 if _is_text_encoding is not None: 105 self._is_text_encoding = _is_text_encoding 106 return self 107 108 def __repr__(self): 109 return "<%s.%s object for encoding %s at %#x>" % \ 110 (self.__class__.__module__, self.__class__.__qualname__, 111 self.name, id(self)) 112 113 class Codec: 114 115 """ Defines the interface for stateless encoders/decoders. 116 117 The .encode()/.decode() methods may use different error 118 handling schemes by providing the errors argument. These 119 string values are predefined: 120 121 'strict' - raise a ValueError error (or a subclass) 122 'ignore' - ignore the character and continue with the next 123 'replace' - replace with a suitable replacement character; 124 Python will use the official U+FFFD REPLACEMENT 125 CHARACTER for the builtin Unicode codecs on 126 decoding and '?' on encoding. 127 'surrogateescape' - replace with private code points U+DCnn. 128 'xmlcharrefreplace' - Replace with the appropriate XML 129 character reference (only for encoding). 130 'backslashreplace' - Replace with backslashed escape sequences. 131 'namereplace' - Replace with \\N{...} escape sequences 132 (only for encoding). 133 134 The set of allowed values can be extended via register_error. 135 136 """ 137 def encode(self, input, errors='strict'): 138 139 """ Encodes the object input and returns a tuple (output 140 object, length consumed). 141 142 errors defines the error handling to apply. It defaults to 143 'strict' handling. 144 145 The method may not store state in the Codec instance. Use 146 StreamWriter for codecs which have to keep state in order to 147 make encoding efficient. 148 149 The encoder must be able to handle zero length input and 150 return an empty object of the output object type in this 151 situation. 152 153 """ 154 raise NotImplementedError 155 156 def decode(self, input, errors='strict'): 157 158 """ Decodes the object input and returns a tuple (output 159 object, length consumed). 160 161 input must be an object which provides the bf_getreadbuf 162 buffer slot. Python strings, buffer objects and memory 163 mapped files are examples of objects providing this slot. 164 165 errors defines the error handling to apply. It defaults to 166 'strict' handling. 167 168 The method may not store state in the Codec instance. Use 169 StreamReader for codecs which have to keep state in order to 170 make decoding efficient. 171 172 The decoder must be able to handle zero length input and 173 return an empty object of the output object type in this 174 situation. 175 176 """ 177 raise NotImplementedError 178 179 class IncrementalEncoder(object): 180 """ 181 An IncrementalEncoder encodes an input in multiple steps. The input can 182 be passed piece by piece to the encode() method. The IncrementalEncoder 183 remembers the state of the encoding process between calls to encode(). 184 """ 185 def __init__(self, errors='strict'): 186 """ 187 Creates an IncrementalEncoder instance. 188 189 The IncrementalEncoder may use different error handling schemes by 190 providing the errors keyword argument. See the module docstring 191 for a list of possible values. 192 """ 193 self.errors = errors 194 self.buffer = "" 195 196 def encode(self, input, final=False): 197 """ 198 Encodes input and returns the resulting object. 199 """ 200 raise NotImplementedError 201 202 def reset(self): 203 """ 204 Resets the encoder to the initial state. 205 """ 206 207 def getstate(self): 208 """ 209 Return the current state of the encoder. 210 """ 211 return 0 212 213 def setstate(self, state): 214 """ 215 Set the current state of the encoder. state must have been 216 returned by getstate(). 217 """ 218 219 class BufferedIncrementalEncoder(IncrementalEncoder): 220 """ 221 This subclass of IncrementalEncoder can be used as the baseclass for an 222 incremental encoder if the encoder must keep some of the output in a 223 buffer between calls to encode(). 224 """ 225 def __init__(self, errors='strict'): 226 IncrementalEncoder.__init__(self, errors) 227 # unencoded input that is kept between calls to encode() 228 self.buffer = "" 229 230 def _buffer_encode(self, input, errors, final): 231 # Overwrite this method in subclasses: It must encode input 232 # and return an (output, length consumed) tuple 233 raise NotImplementedError 234 235 def encode(self, input, final=False): 236 # encode input (taking the buffer into account) 237 data = self.buffer + input 238 (result, consumed) = self._buffer_encode(data, self.errors, final) 239 # keep unencoded input until the next call 240 self.buffer = data[consumed:] 241 return result 242 243 def reset(self): 244 IncrementalEncoder.reset(self) 245 self.buffer = "" 246 247 def getstate(self): 248 return self.buffer or 0 249 250 def setstate(self, state): 251 self.buffer = state or "" 252 253 class IncrementalDecoder(object): 254 """ 255 An IncrementalDecoder decodes an input in multiple steps. The input can 256 be passed piece by piece to the decode() method. The IncrementalDecoder 257 remembers the state of the decoding process between calls to decode(). 258 """ 259 def __init__(self, errors='strict'): 260 """ 261 Create an IncrementalDecoder instance. 262 263 The IncrementalDecoder may use different error handling schemes by 264 providing the errors keyword argument. See the module docstring 265 for a list of possible values. 266 """ 267 self.errors = errors 268 269 def decode(self, input, final=False): 270 """ 271 Decode input and returns the resulting object. 272 """ 273 raise NotImplementedError 274 275 def reset(self): 276 """ 277 Reset the decoder to the initial state. 278 """ 279 280 def getstate(self): 281 """ 282 Return the current state of the decoder. 283 284 This must be a (buffered_input, additional_state_info) tuple. 285 buffered_input must be a bytes object containing bytes that 286 were passed to decode() that have not yet been converted. 287 additional_state_info must be a non-negative integer 288 representing the state of the decoder WITHOUT yet having 289 processed the contents of buffered_input. In the initial state 290 and after reset(), getstate() must return (b"", 0). 291 """ 292 return (b"", 0) 293 294 def setstate(self, state): 295 """ 296 Set the current state of the decoder. 297 298 state must have been returned by getstate(). The effect of 299 setstate((b"", 0)) must be equivalent to reset(). 300 """ 301 302 class BufferedIncrementalDecoder(IncrementalDecoder): 303 """ 304 This subclass of IncrementalDecoder can be used as the baseclass for an 305 incremental decoder if the decoder must be able to handle incomplete 306 byte sequences. 307 """ 308 def __init__(self, errors='strict'): 309 IncrementalDecoder.__init__(self, errors) 310 # undecoded input that is kept between calls to decode() 311 self.buffer = b"" 312 313 def _buffer_decode(self, input, errors, final): 314 # Overwrite this method in subclasses: It must decode input 315 # and return an (output, length consumed) tuple 316 raise NotImplementedError 317 318 def decode(self, input, final=False): 319 # decode input (taking the buffer into account) 320 data = self.buffer + input 321 (result, consumed) = self._buffer_decode(data, self.errors, final) 322 # keep undecoded input until the next call 323 self.buffer = data[consumed:] 324 return result 325 326 def reset(self): 327 IncrementalDecoder.reset(self) 328 self.buffer = b"" 329 330 def getstate(self): 331 # additional state info is always 0 332 return (self.buffer, 0) 333 334 def setstate(self, state): 335 # ignore additional state info 336 self.buffer = state[0] 337 338 # 339 # The StreamWriter and StreamReader class provide generic working 340 # interfaces which can be used to implement new encoding submodules 341 # very easily. See encodings/utf_8.py for an example on how this is 342 # done. 343 # 344 345 class StreamWriter(Codec): 346 347 def __init__(self, stream, errors='strict'): 348 349 """ Creates a StreamWriter instance. 350 351 stream must be a file-like object open for writing. 352 353 The StreamWriter may use different error handling 354 schemes by providing the errors keyword argument. These 355 parameters are predefined: 356 357 'strict' - raise a ValueError (or a subclass) 358 'ignore' - ignore the character and continue with the next 359 'replace'- replace with a suitable replacement character 360 'xmlcharrefreplace' - Replace with the appropriate XML 361 character reference. 362 'backslashreplace' - Replace with backslashed escape 363 sequences. 364 'namereplace' - Replace with \\N{...} escape sequences. 365 366 The set of allowed parameter values can be extended via 367 register_error. 368 """ 369 self.stream = stream 370 self.errors = errors 371 372 def write(self, object): 373 374 """ Writes the object's contents encoded to self.stream. 375 """ 376 data, consumed = self.encode(object, self.errors) 377 self.stream.write(data) 378 379 def writelines(self, list): 380 381 """ Writes the concatenated list of strings to the stream 382 using .write(). 383 """ 384 self.write(''.join(list)) 385 386 def reset(self): 387 388 """ Flushes and resets the codec buffers used for keeping state. 389 390 Calling this method should ensure that the data on the 391 output is put into a clean state, that allows appending 392 of new fresh data without having to rescan the whole 393 stream to recover state. 394 395 """ 396 pass 397 398 def seek(self, offset, whence=0): 399 self.stream.seek(offset, whence) 400 if whence == 0 and offset == 0: 401 self.reset() 402 403 def __getattr__(self, name, 404 getattr=getattr): 405 406 """ Inherit all other methods from the underlying stream. 407 """ 408 return getattr(self.stream, name) 409 410 def __enter__(self): 411 return self 412 413 def __exit__(self, type, value, tb): 414 self.stream.close() 415 416 ### 417 418 class StreamReader(Codec): 419 420 charbuffertype = str 421 422 def __init__(self, stream, errors='strict'): 423 424 """ Creates a StreamReader instance. 425 426 stream must be a file-like object open for reading. 427 428 The StreamReader may use different error handling 429 schemes by providing the errors keyword argument. These 430 parameters are predefined: 431 432 'strict' - raise a ValueError (or a subclass) 433 'ignore' - ignore the character and continue with the next 434 'replace'- replace with a suitable replacement character 435 'backslashreplace' - Replace with backslashed escape sequences; 436 437 The set of allowed parameter values can be extended via 438 register_error. 439 """ 440 self.stream = stream 441 self.errors = errors 442 self.bytebuffer = b"" 443 self._empty_charbuffer = self.charbuffertype() 444 self.charbuffer = self._empty_charbuffer 445 self.linebuffer = None 446 447 def decode(self, input, errors='strict'): 448 raise NotImplementedError 449 450 def read(self, size=-1, chars=-1, firstline=False): 451 452 """ Decodes data from the stream self.stream and returns the 453 resulting object. 454 455 chars indicates the number of decoded code points or bytes to 456 return. read() will never return more data than requested, 457 but it might return less, if there is not enough available. 458 459 size indicates the approximate maximum number of decoded 460 bytes or code points to read for decoding. The decoder 461 can modify this setting as appropriate. The default value 462 -1 indicates to read and decode as much as possible. size 463 is intended to prevent having to decode huge files in one 464 step. 465 466 If firstline is true, and a UnicodeDecodeError happens 467 after the first line terminator in the input only the first line 468 will be returned, the rest of the input will be kept until the 469 next call to read(). 470 471 The method should use a greedy read strategy, meaning that 472 it should read as much data as is allowed within the 473 definition of the encoding and the given size, e.g. if 474 optional encoding endings or state markers are available 475 on the stream, these should be read too. 476 """ 477 # If we have lines cached, first merge them back into characters 478 if self.linebuffer: 479 self.charbuffer = self._empty_charbuffer.join(self.linebuffer) 480 self.linebuffer = None 481 482 # read until we get the required number of characters (if available) 483 while True: 484 # can the request be satisfied from the character buffer? 485 if chars >= 0: 486 if len(self.charbuffer) >= chars: 487 break 488 elif size >= 0: 489 if len(self.charbuffer) >= size: 490 break 491 # we need more data 492 if size < 0: 493 newdata = self.stream.read() 494 else: 495 newdata = self.stream.read(size) 496 # decode bytes (those remaining from the last call included) 497 data = self.bytebuffer + newdata 498 if not data: 499 break 500 try: 501 newchars, decodedbytes = self.decode(data, self.errors) 502 except UnicodeDecodeError as exc: 503 if firstline: 504 newchars, decodedbytes = \ 505 self.decode(data[:exc.start], self.errors) 506 lines = newchars.splitlines(keepends=True) 507 if len(lines)<=1: 508 raise 509 else: 510 raise 511 # keep undecoded bytes until the next call 512 self.bytebuffer = data[decodedbytes:] 513 # put new characters in the character buffer 514 self.charbuffer += newchars 515 # there was no data available 516 if not newdata: 517 break 518 if chars < 0: 519 # Return everything we've got 520 result = self.charbuffer 521 self.charbuffer = self._empty_charbuffer 522 else: 523 # Return the first chars characters 524 result = self.charbuffer[:chars] 525 self.charbuffer = self.charbuffer[chars:] 526 return result 527 528 def readline(self, size=None, keepends=True): 529 530 """ Read one line from the input stream and return the 531 decoded data. 532 533 size, if given, is passed as size argument to the 534 read() method. 535 536 """ 537 # If we have lines cached from an earlier read, return 538 # them unconditionally 539 if self.linebuffer: 540 line = self.linebuffer[0] 541 del self.linebuffer[0] 542 if len(self.linebuffer) == 1: 543 # revert to charbuffer mode; we might need more data 544 # next time 545 self.charbuffer = self.linebuffer[0] 546 self.linebuffer = None 547 if not keepends: 548 line = line.splitlines(keepends=False)[0] 549 return line 550 551 readsize = size or 72 552 line = self._empty_charbuffer 553 # If size is given, we call read() only once 554 while True: 555 data = self.read(readsize, firstline=True) 556 if data: 557 # If we're at a "\r" read one extra character (which might 558 # be a "\n") to get a proper line ending. If the stream is 559 # temporarily exhausted we return the wrong line ending. 560 if (isinstance(data, str) and data.endswith("\r")) or \ 561 (isinstance(data, bytes) and data.endswith(b"\r")): 562 data += self.read(size=1, chars=1) 563 564 line += data 565 lines = line.splitlines(keepends=True) 566 if lines: 567 if len(lines) > 1: 568 # More than one line result; the first line is a full line 569 # to return 570 line = lines[0] 571 del lines[0] 572 if len(lines) > 1: 573 # cache the remaining lines 574 lines[-1] += self.charbuffer 575 self.linebuffer = lines 576 self.charbuffer = None 577 else: 578 # only one remaining line, put it back into charbuffer 579 self.charbuffer = lines[0] + self.charbuffer 580 if not keepends: 581 line = line.splitlines(keepends=False)[0] 582 break 583 line0withend = lines[0] 584 line0withoutend = lines[0].splitlines(keepends=False)[0] 585 if line0withend != line0withoutend: # We really have a line end 586 # Put the rest back together and keep it until the next call 587 self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \ 588 self.charbuffer 589 if keepends: 590 line = line0withend 591 else: 592 line = line0withoutend 593 break 594 # we didn't get anything or this was our only try 595 if not data or size is not None: 596 if line and not keepends: 597 line = line.splitlines(keepends=False)[0] 598 break 599 if readsize < 8000: 600 readsize *= 2 601 return line 602 603 def readlines(self, sizehint=None, keepends=True): 604 605 """ Read all lines available on the input stream 606 and return them as a list. 607 608 Line breaks are implemented using the codec's decoder 609 method and are included in the list entries. 610 611 sizehint, if given, is ignored since there is no efficient 612 way to finding the true end-of-line. 613 614 """ 615 data = self.read() 616 return data.splitlines(keepends) 617 618 def reset(self): 619 620 """ Resets the codec buffers used for keeping state. 621 622 Note that no stream repositioning should take place. 623 This method is primarily intended to be able to recover 624 from decoding errors. 625 626 """ 627 self.bytebuffer = b"" 628 self.charbuffer = self._empty_charbuffer 629 self.linebuffer = None 630 631 def seek(self, offset, whence=0): 632 """ Set the input stream's current position. 633 634 Resets the codec buffers used for keeping state. 635 """ 636 self.stream.seek(offset, whence) 637 self.reset() 638 639 def __next__(self): 640 641 """ Return the next decoded line from the input stream.""" 642 line = self.readline() 643 if line: 644 return line 645 raise StopIteration 646 647 def __iter__(self): 648 return self 649 650 def __getattr__(self, name, 651 getattr=getattr): 652 653 """ Inherit all other methods from the underlying stream. 654 """ 655 return getattr(self.stream, name) 656 657 def __enter__(self): 658 return self 659 660 def __exit__(self, type, value, tb): 661 self.stream.close() 662 663 ### 664 665 class StreamReaderWriter: 666 667 """ StreamReaderWriter instances allow wrapping streams which 668 work in both read and write modes. 669 670 The design is such that one can use the factory functions 671 returned by the codec.lookup() function to construct the 672 instance. 673 674 """ 675 # Optional attributes set by the file wrappers below 676 encoding = 'unknown' 677 678 def __init__(self, stream, Reader, Writer, errors='strict'): 679 680 """ Creates a StreamReaderWriter instance. 681 682 stream must be a Stream-like object. 683 684 Reader, Writer must be factory functions or classes 685 providing the StreamReader, StreamWriter interface resp. 686 687 Error handling is done in the same way as defined for the 688 StreamWriter/Readers. 689 690 """ 691 self.stream = stream 692 self.reader = Reader(stream, errors) 693 self.writer = Writer(stream, errors) 694 self.errors = errors 695 696 def read(self, size=-1): 697 698 return self.reader.read(size) 699 700 def readline(self, size=None): 701 702 return self.reader.readline(size) 703 704 def readlines(self, sizehint=None): 705 706 return self.reader.readlines(sizehint) 707 708 def __next__(self): 709 710 """ Return the next decoded line from the input stream.""" 711 return next(self.reader) 712 713 def __iter__(self): 714 return self 715 716 def write(self, data): 717 718 return self.writer.write(data) 719 720 def writelines(self, list): 721 722 return self.writer.writelines(list) 723 724 def reset(self): 725 726 self.reader.reset() 727 self.writer.reset() 728 729 def seek(self, offset, whence=0): 730 self.stream.seek(offset, whence) 731 self.reader.reset() 732 if whence == 0 and offset == 0: 733 self.writer.reset() 734 735 def __getattr__(self, name, 736 getattr=getattr): 737 738 """ Inherit all other methods from the underlying stream. 739 """ 740 return getattr(self.stream, name) 741 742 # these are needed to make "with codecs.open(...)" work properly 743 744 def __enter__(self): 745 return self 746 747 def __exit__(self, type, value, tb): 748 self.stream.close() 749 750 ### 751 752 class StreamRecoder: 753 754 """ StreamRecoder instances translate data from one encoding to another. 755 756 They use the complete set of APIs returned by the 757 codecs.lookup() function to implement their task. 758 759 Data written to the StreamRecoder is first decoded into an 760 intermediate format (depending on the "decode" codec) and then 761 written to the underlying stream using an instance of the provided 762 Writer class. 763 764 In the other direction, data is read from the underlying stream using 765 a Reader instance and then encoded and returned to the caller. 766 767 """ 768 # Optional attributes set by the file wrappers below 769 data_encoding = 'unknown' 770 file_encoding = 'unknown' 771 772 def __init__(self, stream, encode, decode, Reader, Writer, 773 errors='strict'): 774 775 """ Creates a StreamRecoder instance which implements a two-way 776 conversion: encode and decode work on the frontend (the 777 data visible to .read() and .write()) while Reader and Writer 778 work on the backend (the data in stream). 779 780 You can use these objects to do transparent 781 transcodings from e.g. latin-1 to utf-8 and back. 782 783 stream must be a file-like object. 784 785 encode and decode must adhere to the Codec interface; Reader and 786 Writer must be factory functions or classes providing the 787 StreamReader and StreamWriter interfaces resp. 788 789 Error handling is done in the same way as defined for the 790 StreamWriter/Readers. 791 792 """ 793 self.stream = stream 794 self.encode = encode 795 self.decode = decode 796 self.reader = Reader(stream, errors) 797 self.writer = Writer(stream, errors) 798 self.errors = errors 799 800 def read(self, size=-1): 801 802 data = self.reader.read(size) 803 data, bytesencoded = self.encode(data, self.errors) 804 return data 805 806 def readline(self, size=None): 807 808 if size is None: 809 data = self.reader.readline() 810 else: 811 data = self.reader.readline(size) 812 data, bytesencoded = self.encode(data, self.errors) 813 return data 814 815 def readlines(self, sizehint=None): 816 817 data = self.reader.read() 818 data, bytesencoded = self.encode(data, self.errors) 819 return data.splitlines(keepends=True) 820 821 def __next__(self): 822 823 """ Return the next decoded line from the input stream.""" 824 data = next(self.reader) 825 data, bytesencoded = self.encode(data, self.errors) 826 return data 827 828 def __iter__(self): 829 return self 830 831 def write(self, data): 832 833 data, bytesdecoded = self.decode(data, self.errors) 834 return self.writer.write(data) 835 836 def writelines(self, list): 837 838 data = ''.join(list) 839 data, bytesdecoded = self.decode(data, self.errors) 840 return self.writer.write(data) 841 842 def reset(self): 843 844 self.reader.reset() 845 self.writer.reset() 846 847 def __getattr__(self, name, 848 getattr=getattr): 849 850 """ Inherit all other methods from the underlying stream. 851 """ 852 return getattr(self.stream, name) 853 854 def __enter__(self): 855 return self 856 857 def __exit__(self, type, value, tb): 858 self.stream.close() 859 860 ### Shortcuts 861 862 def open(filename, mode='r', encoding=None, errors='strict', buffering=1): 863 864 """ Open an encoded file using the given mode and return 865 a wrapped version providing transparent encoding/decoding. 866 867 Note: The wrapped version will only accept the object format 868 defined by the codecs, i.e. Unicode objects for most builtin 869 codecs. Output is also codec dependent and will usually be 870 Unicode as well. 871 872 Underlying encoded files are always opened in binary mode. 873 The default file mode is 'r', meaning to open the file in read mode. 874 875 encoding specifies the encoding which is to be used for the 876 file. 877 878 errors may be given to define the error handling. It defaults 879 to 'strict' which causes ValueErrors to be raised in case an 880 encoding error occurs. 881 882 buffering has the same meaning as for the builtin open() API. 883 It defaults to line buffered. 884 885 The returned wrapped file object provides an extra attribute 886 .encoding which allows querying the used encoding. This 887 attribute is only available if an encoding was specified as 888 parameter. 889 890 """ 891 if encoding is not None and \ 892 'b' not in mode: 893 # Force opening of the file in binary mode 894 mode = mode + 'b' 895 file = builtins.open(filename, mode, buffering) 896 if encoding is None: 897 return file 898 info = lookup(encoding) 899 srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors) 900 # Add attributes to simplify introspection 901 srw.encoding = encoding 902 return srw 903 904 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'): 905 906 """ Return a wrapped version of file which provides transparent 907 encoding translation. 908 909 Data written to the wrapped file is decoded according 910 to the given data_encoding and then encoded to the underlying 911 file using file_encoding. The intermediate data type 912 will usually be Unicode but depends on the specified codecs. 913 914 Bytes read from the file are decoded using file_encoding and then 915 passed back to the caller encoded using data_encoding. 916 917 If file_encoding is not given, it defaults to data_encoding. 918 919 errors may be given to define the error handling. It defaults 920 to 'strict' which causes ValueErrors to be raised in case an 921 encoding error occurs. 922 923 The returned wrapped file object provides two extra attributes 924 .data_encoding and .file_encoding which reflect the given 925 parameters of the same name. The attributes can be used for 926 introspection by Python programs. 927 928 """ 929 if file_encoding is None: 930 file_encoding = data_encoding 931 data_info = lookup(data_encoding) 932 file_info = lookup(file_encoding) 933 sr = StreamRecoder(file, data_info.encode, data_info.decode, 934 file_info.streamreader, file_info.streamwriter, errors) 935 # Add attributes to simplify introspection 936 sr.data_encoding = data_encoding 937 sr.file_encoding = file_encoding 938 return sr 939 940 ### Helpers for codec lookup 941 942 def getencoder(encoding): 943 944 """ Lookup up the codec for the given encoding and return 945 its encoder function. 946 947 Raises a LookupError in case the encoding cannot be found. 948 949 """ 950 return lookup(encoding).encode 951 952 def getdecoder(encoding): 953 954 """ Lookup up the codec for the given encoding and return 955 its decoder function. 956 957 Raises a LookupError in case the encoding cannot be found. 958 959 """ 960 return lookup(encoding).decode 961 962 def getincrementalencoder(encoding): 963 964 """ Lookup up the codec for the given encoding and return 965 its IncrementalEncoder class or factory function. 966 967 Raises a LookupError in case the encoding cannot be found 968 or the codecs doesn't provide an incremental encoder. 969 970 """ 971 encoder = lookup(encoding).incrementalencoder 972 if encoder is None: 973 raise LookupError(encoding) 974 return encoder 975 976 def getincrementaldecoder(encoding): 977 978 """ Lookup up the codec for the given encoding and return 979 its IncrementalDecoder class or factory function. 980 981 Raises a LookupError in case the encoding cannot be found 982 or the codecs doesn't provide an incremental decoder. 983 984 """ 985 decoder = lookup(encoding).incrementaldecoder 986 if decoder is None: 987 raise LookupError(encoding) 988 return decoder 989 990 def getreader(encoding): 991 992 """ Lookup up the codec for the given encoding and return 993 its StreamReader class or factory function. 994 995 Raises a LookupError in case the encoding cannot be found. 996 997 """ 998 return lookup(encoding).streamreader 999 1000 def getwriter(encoding): 1001 1002 """ Lookup up the codec for the given encoding and return 1003 its StreamWriter class or factory function. 1004 1005 Raises a LookupError in case the encoding cannot be found. 1006 1007 """ 1008 return lookup(encoding).streamwriter 1009 1010 def iterencode(iterator, encoding, errors='strict', **kwargs): 1011 """ 1012 Encoding iterator. 1013 1014 Encodes the input strings from the iterator using an IncrementalEncoder. 1015 1016 errors and kwargs are passed through to the IncrementalEncoder 1017 constructor. 1018 """ 1019 encoder = getincrementalencoder(encoding)(errors, **kwargs) 1020 for input in iterator: 1021 output = encoder.encode(input) 1022 if output: 1023 yield output 1024 output = encoder.encode("", True) 1025 if output: 1026 yield output 1027 1028 def iterdecode(iterator, encoding, errors='strict', **kwargs): 1029 """ 1030 Decoding iterator. 1031 1032 Decodes the input strings from the iterator using an IncrementalDecoder. 1033 1034 errors and kwargs are passed through to the IncrementalDecoder 1035 constructor. 1036 """ 1037 decoder = getincrementaldecoder(encoding)(errors, **kwargs) 1038 for input in iterator: 1039 output = decoder.decode(input) 1040 if output: 1041 yield output 1042 output = decoder.decode(b"", True) 1043 if output: 1044 yield output 1045 1046 ### Helpers for charmap-based codecs 1047 1048 def make_identity_dict(rng): 1049 1050 """ make_identity_dict(rng) -> dict 1051 1052 Return a dictionary where elements of the rng sequence are 1053 mapped to themselves. 1054 1055 """ 1056 return {i:i for i in rng} 1057 1058 def make_encoding_map(decoding_map): 1059 1060 """ Creates an encoding map from a decoding map. 1061 1062 If a target mapping in the decoding map occurs multiple 1063 times, then that target is mapped to None (undefined mapping), 1064 causing an exception when encountered by the charmap codec 1065 during translation. 1066 1067 One example where this happens is cp875.py which decodes 1068 multiple character to \\u001a. 1069 1070 """ 1071 m = {} 1072 for k,v in decoding_map.items(): 1073 if not v in m: 1074 m[v] = k 1075 else: 1076 m[v] = None 1077 return m 1078 1079 ### error handlers 1080 1081 try: 1082 strict_errors = lookup_error("strict") 1083 ignore_errors = lookup_error("ignore") 1084 replace_errors = lookup_error("replace") 1085 xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace") 1086 backslashreplace_errors = lookup_error("backslashreplace") 1087 namereplace_errors = lookup_error("namereplace") 1088 except LookupError: 1089 # In --disable-unicode builds, these error handler are missing 1090 strict_errors = None 1091 ignore_errors = None 1092 replace_errors = None 1093 xmlcharrefreplace_errors = None 1094 backslashreplace_errors = None 1095 namereplace_errors = None 1096 1097 # Tell modulefinder that using codecs probably needs the encodings 1098 # package 1099 _false = 0 1100 if _false: 1101 import encodings 1102 1103 ### Tests 1104 1105 if __name__ == '__main__': 1106 1107 # Make stdout translate Latin-1 output into UTF-8 output 1108 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8') 1109 1110 # Have stdin translate Latin-1 input into UTF-8 input 1111 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1') 1112