1 """ codecs -- Python Codec Registry, API and helpers. 2 3 4 Written by Marc-Andre Lemburg (mal (at] lemburg.com). 5 6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 7 8 """#" 9 10 import __builtin__, sys 11 12 ### Registry and builtin stateless codec functions 13 14 try: 15 from _codecs import * 16 except ImportError, why: 17 raise SystemError('Failed to load the builtin codecs: %s' % why) 18 19 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE", 20 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", 21 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE", 22 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE", 23 "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder", 24 "StreamReader", "StreamWriter", 25 "StreamReaderWriter", "StreamRecoder", 26 "getencoder", "getdecoder", "getincrementalencoder", 27 "getincrementaldecoder", "getreader", "getwriter", 28 "encode", "decode", "iterencode", "iterdecode", 29 "strict_errors", "ignore_errors", "replace_errors", 30 "xmlcharrefreplace_errors", "backslashreplace_errors", 31 "register_error", "lookup_error"] 32 33 ### Constants 34 35 # 36 # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) 37 # and its possible byte string values 38 # for UTF8/UTF16/UTF32 output and little/big endian machines 39 # 40 41 # UTF-8 42 BOM_UTF8 = '\xef\xbb\xbf' 43 44 # UTF-16, little endian 45 BOM_LE = BOM_UTF16_LE = '\xff\xfe' 46 47 # UTF-16, big endian 48 BOM_BE = BOM_UTF16_BE = '\xfe\xff' 49 50 # UTF-32, little endian 51 BOM_UTF32_LE = '\xff\xfe\x00\x00' 52 53 # UTF-32, big endian 54 BOM_UTF32_BE = '\x00\x00\xfe\xff' 55 56 if sys.byteorder == 'little': 57 58 # UTF-16, native endianness 59 BOM = BOM_UTF16 = BOM_UTF16_LE 60 61 # UTF-32, native endianness 62 BOM_UTF32 = BOM_UTF32_LE 63 64 else: 65 66 # UTF-16, native endianness 67 BOM = BOM_UTF16 = BOM_UTF16_BE 68 69 # UTF-32, native endianness 70 BOM_UTF32 = BOM_UTF32_BE 71 72 # Old broken names (don't use in new code) 73 BOM32_LE = BOM_UTF16_LE 74 BOM32_BE = BOM_UTF16_BE 75 BOM64_LE = BOM_UTF32_LE 76 BOM64_BE = BOM_UTF32_BE 77 78 79 ### Codec base classes (defining the API) 80 81 class CodecInfo(tuple): 82 """Codec details when looking up the codec registry""" 83 84 # Private API to allow Python to blacklist the known non-Unicode 85 # codecs in the standard library. A more general mechanism to 86 # reliably distinguish test encodings from other codecs will hopefully 87 # be defined for Python 3.5 88 # 89 # See http://bugs.python.org/issue19619 90 _is_text_encoding = True # Assume codecs are text encodings by default 91 92 def __new__(cls, encode, decode, streamreader=None, streamwriter=None, 93 incrementalencoder=None, incrementaldecoder=None, name=None, 94 _is_text_encoding=None): 95 self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter)) 96 self.name = name 97 self.encode = encode 98 self.decode = decode 99 self.incrementalencoder = incrementalencoder 100 self.incrementaldecoder = incrementaldecoder 101 self.streamwriter = streamwriter 102 self.streamreader = streamreader 103 if _is_text_encoding is not None: 104 self._is_text_encoding = _is_text_encoding 105 return self 106 107 def __repr__(self): 108 return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self)) 109 110 class Codec: 111 112 """ Defines the interface for stateless encoders/decoders. 113 114 The .encode()/.decode() methods may use different error 115 handling schemes by providing the errors argument. These 116 string values are predefined: 117 118 'strict' - raise a ValueError error (or a subclass) 119 'ignore' - ignore the character and continue with the next 120 'replace' - replace with a suitable replacement character; 121 Python will use the official U+FFFD REPLACEMENT 122 CHARACTER for the builtin Unicode codecs on 123 decoding and '?' on encoding. 124 'xmlcharrefreplace' - Replace with the appropriate XML 125 character reference (only for encoding). 126 'backslashreplace' - Replace with backslashed escape sequences 127 (only for encoding). 128 129 The set of allowed values can be extended via register_error. 130 131 """ 132 def encode(self, input, errors='strict'): 133 134 """ Encodes the object input and returns a tuple (output 135 object, length consumed). 136 137 errors defines the error handling to apply. It defaults to 138 'strict' handling. 139 140 The method may not store state in the Codec instance. Use 141 StreamWriter for codecs which have to keep state in order to 142 make encoding efficient. 143 144 The encoder must be able to handle zero length input and 145 return an empty object of the output object type in this 146 situation. 147 148 """ 149 raise NotImplementedError 150 151 def decode(self, input, errors='strict'): 152 153 """ Decodes the object input and returns a tuple (output 154 object, length consumed). 155 156 input must be an object which provides the bf_getreadbuf 157 buffer slot. Python strings, buffer objects and memory 158 mapped files are examples of objects providing this slot. 159 160 errors defines the error handling to apply. It defaults to 161 'strict' handling. 162 163 The method may not store state in the Codec instance. Use 164 StreamReader for codecs which have to keep state in order to 165 make decoding efficient. 166 167 The decoder must be able to handle zero length input and 168 return an empty object of the output object type in this 169 situation. 170 171 """ 172 raise NotImplementedError 173 174 class IncrementalEncoder(object): 175 """ 176 An IncrementalEncoder encodes an input in multiple steps. The input can be 177 passed piece by piece to the encode() method. The IncrementalEncoder remembers 178 the state of the Encoding process between calls to encode(). 179 """ 180 def __init__(self, errors='strict'): 181 """ 182 Creates an IncrementalEncoder instance. 183 184 The IncrementalEncoder may use different error handling schemes by 185 providing the errors keyword argument. See the module docstring 186 for a list of possible values. 187 """ 188 self.errors = errors 189 self.buffer = "" 190 191 def encode(self, input, final=False): 192 """ 193 Encodes input and returns the resulting object. 194 """ 195 raise NotImplementedError 196 197 def reset(self): 198 """ 199 Resets the encoder to the initial state. 200 """ 201 202 def getstate(self): 203 """ 204 Return the current state of the encoder. 205 """ 206 return 0 207 208 def setstate(self, state): 209 """ 210 Set the current state of the encoder. state must have been 211 returned by getstate(). 212 """ 213 214 class BufferedIncrementalEncoder(IncrementalEncoder): 215 """ 216 This subclass of IncrementalEncoder can be used as the baseclass for an 217 incremental encoder if the encoder must keep some of the output in a 218 buffer between calls to encode(). 219 """ 220 def __init__(self, errors='strict'): 221 IncrementalEncoder.__init__(self, errors) 222 self.buffer = "" # unencoded input that is kept between calls to encode() 223 224 def _buffer_encode(self, input, errors, final): 225 # Overwrite this method in subclasses: It must encode input 226 # and return an (output, length consumed) tuple 227 raise NotImplementedError 228 229 def encode(self, input, final=False): 230 # encode input (taking the buffer into account) 231 data = self.buffer + input 232 (result, consumed) = self._buffer_encode(data, self.errors, final) 233 # keep unencoded input until the next call 234 self.buffer = data[consumed:] 235 return result 236 237 def reset(self): 238 IncrementalEncoder.reset(self) 239 self.buffer = "" 240 241 def getstate(self): 242 return self.buffer or 0 243 244 def setstate(self, state): 245 self.buffer = state or "" 246 247 class IncrementalDecoder(object): 248 """ 249 An IncrementalDecoder decodes an input in multiple steps. The input can be 250 passed piece by piece to the decode() method. The IncrementalDecoder 251 remembers the state of the decoding process between calls to decode(). 252 """ 253 def __init__(self, errors='strict'): 254 """ 255 Creates an IncrementalDecoder instance. 256 257 The IncrementalDecoder may use different error handling schemes by 258 providing the errors keyword argument. See the module docstring 259 for a list of possible values. 260 """ 261 self.errors = errors 262 263 def decode(self, input, final=False): 264 """ 265 Decodes input and returns the resulting object. 266 """ 267 raise NotImplementedError 268 269 def reset(self): 270 """ 271 Resets the decoder to the initial state. 272 """ 273 274 def getstate(self): 275 """ 276 Return the current state of the decoder. 277 278 This must be a (buffered_input, additional_state_info) tuple. 279 buffered_input must be a bytes object containing bytes that 280 were passed to decode() that have not yet been converted. 281 additional_state_info must be a non-negative integer 282 representing the state of the decoder WITHOUT yet having 283 processed the contents of buffered_input. In the initial state 284 and after reset(), getstate() must return (b"", 0). 285 """ 286 return (b"", 0) 287 288 def setstate(self, state): 289 """ 290 Set the current state of the decoder. 291 292 state must have been returned by getstate(). The effect of 293 setstate((b"", 0)) must be equivalent to reset(). 294 """ 295 296 class BufferedIncrementalDecoder(IncrementalDecoder): 297 """ 298 This subclass of IncrementalDecoder can be used as the baseclass for an 299 incremental decoder if the decoder must be able to handle incomplete byte 300 sequences. 301 """ 302 def __init__(self, errors='strict'): 303 IncrementalDecoder.__init__(self, errors) 304 self.buffer = "" # undecoded input that is kept between calls to decode() 305 306 def _buffer_decode(self, input, errors, final): 307 # Overwrite this method in subclasses: It must decode input 308 # and return an (output, length consumed) tuple 309 raise NotImplementedError 310 311 def decode(self, input, final=False): 312 # decode input (taking the buffer into account) 313 data = self.buffer + input 314 (result, consumed) = self._buffer_decode(data, self.errors, final) 315 # keep undecoded input until the next call 316 self.buffer = data[consumed:] 317 return result 318 319 def reset(self): 320 IncrementalDecoder.reset(self) 321 self.buffer = "" 322 323 def getstate(self): 324 # additional state info is always 0 325 return (self.buffer, 0) 326 327 def setstate(self, state): 328 # ignore additional state info 329 self.buffer = state[0] 330 331 # 332 # The StreamWriter and StreamReader class provide generic working 333 # interfaces which can be used to implement new encoding submodules 334 # very easily. See encodings/utf_8.py for an example on how this is 335 # done. 336 # 337 338 class StreamWriter(Codec): 339 340 def __init__(self, stream, errors='strict'): 341 342 """ Creates a StreamWriter instance. 343 344 stream must be a file-like object open for writing 345 (binary) data. 346 347 The StreamWriter may use different error handling 348 schemes by providing the errors keyword argument. These 349 parameters are predefined: 350 351 'strict' - raise a ValueError (or a subclass) 352 'ignore' - ignore the character and continue with the next 353 'replace'- replace with a suitable replacement character 354 'xmlcharrefreplace' - Replace with the appropriate XML 355 character reference. 356 'backslashreplace' - Replace with backslashed escape 357 sequences (only for encoding). 358 359 The set of allowed parameter values can be extended via 360 register_error. 361 """ 362 self.stream = stream 363 self.errors = errors 364 365 def write(self, object): 366 367 """ Writes the object's contents encoded to self.stream. 368 """ 369 data, consumed = self.encode(object, self.errors) 370 self.stream.write(data) 371 372 def writelines(self, list): 373 374 """ Writes the concatenated list of strings to the stream 375 using .write(). 376 """ 377 self.write(''.join(list)) 378 379 def reset(self): 380 381 """ Flushes and resets the codec buffers used for keeping state. 382 383 Calling this method should ensure that the data on the 384 output is put into a clean state, that allows appending 385 of new fresh data without having to rescan the whole 386 stream to recover state. 387 388 """ 389 pass 390 391 def seek(self, offset, whence=0): 392 self.stream.seek(offset, whence) 393 if whence == 0 and offset == 0: 394 self.reset() 395 396 def __getattr__(self, name, 397 getattr=getattr): 398 399 """ Inherit all other methods from the underlying stream. 400 """ 401 return getattr(self.stream, name) 402 403 def __enter__(self): 404 return self 405 406 def __exit__(self, type, value, tb): 407 self.stream.close() 408 409 ### 410 411 class StreamReader(Codec): 412 413 def __init__(self, stream, errors='strict'): 414 415 """ Creates a StreamReader instance. 416 417 stream must be a file-like object open for reading 418 (binary) data. 419 420 The StreamReader may use different error handling 421 schemes by providing the errors keyword argument. These 422 parameters are predefined: 423 424 'strict' - raise a ValueError (or a subclass) 425 'ignore' - ignore the character and continue with the next 426 'replace'- replace with a suitable replacement character; 427 428 The set of allowed parameter values can be extended via 429 register_error. 430 """ 431 self.stream = stream 432 self.errors = errors 433 self.bytebuffer = "" 434 # For str->str decoding this will stay a str 435 # For str->unicode decoding the first read will promote it to unicode 436 self.charbuffer = "" 437 self.linebuffer = None 438 439 def decode(self, input, errors='strict'): 440 raise NotImplementedError 441 442 def read(self, size=-1, chars=-1, firstline=False): 443 444 """ Decodes data from the stream self.stream and returns the 445 resulting object. 446 447 chars indicates the number of characters to read from the 448 stream. read() will never return more than chars 449 characters, but it might return less, if there are not enough 450 characters available. 451 452 size indicates the approximate maximum number of bytes to 453 read from the stream for decoding purposes. The decoder 454 can modify this setting as appropriate. The default value 455 -1 indicates to read and decode as much as possible. size 456 is intended to prevent having to decode huge files in one 457 step. 458 459 If firstline is true, and a UnicodeDecodeError happens 460 after the first line terminator in the input only the first line 461 will be returned, the rest of the input will be kept until the 462 next call to read(). 463 464 The method should use a greedy read strategy meaning that 465 it should read as much data as is allowed within the 466 definition of the encoding and the given size, e.g. if 467 optional encoding endings or state markers are available 468 on the stream, these should be read too. 469 """ 470 # If we have lines cached, first merge them back into characters 471 if self.linebuffer: 472 self.charbuffer = "".join(self.linebuffer) 473 self.linebuffer = None 474 475 # read until we get the required number of characters (if available) 476 while True: 477 # can the request be satisfied from the character buffer? 478 if chars >= 0: 479 if len(self.charbuffer) >= chars: 480 break 481 elif size >= 0: 482 if len(self.charbuffer) >= size: 483 break 484 # we need more data 485 if size < 0: 486 newdata = self.stream.read() 487 else: 488 newdata = self.stream.read(size) 489 # decode bytes (those remaining from the last call included) 490 data = self.bytebuffer + newdata 491 try: 492 newchars, decodedbytes = self.decode(data, self.errors) 493 except UnicodeDecodeError, exc: 494 if firstline: 495 newchars, decodedbytes = self.decode(data[:exc.start], self.errors) 496 lines = newchars.splitlines(True) 497 if len(lines)<=1: 498 raise 499 else: 500 raise 501 # keep undecoded bytes until the next call 502 self.bytebuffer = data[decodedbytes:] 503 # put new characters in the character buffer 504 self.charbuffer += newchars 505 # there was no data available 506 if not newdata: 507 break 508 if chars < 0: 509 # Return everything we've got 510 result = self.charbuffer 511 self.charbuffer = "" 512 else: 513 # Return the first chars characters 514 result = self.charbuffer[:chars] 515 self.charbuffer = self.charbuffer[chars:] 516 return result 517 518 def readline(self, size=None, keepends=True): 519 520 """ Read one line from the input stream and return the 521 decoded data. 522 523 size, if given, is passed as size argument to the 524 read() method. 525 526 """ 527 # If we have lines cached from an earlier read, return 528 # them unconditionally 529 if self.linebuffer: 530 line = self.linebuffer[0] 531 del self.linebuffer[0] 532 if len(self.linebuffer) == 1: 533 # revert to charbuffer mode; we might need more data 534 # next time 535 self.charbuffer = self.linebuffer[0] 536 self.linebuffer = None 537 if not keepends: 538 line = line.splitlines(False)[0] 539 return line 540 541 readsize = size or 72 542 line = "" 543 # If size is given, we call read() only once 544 while True: 545 data = self.read(readsize, firstline=True) 546 if data: 547 # If we're at a "\r" read one extra character (which might 548 # be a "\n") to get a proper line ending. If the stream is 549 # temporarily exhausted we return the wrong line ending. 550 if data.endswith("\r"): 551 data += self.read(size=1, chars=1) 552 553 line += data 554 lines = line.splitlines(True) 555 if lines: 556 if len(lines) > 1: 557 # More than one line result; the first line is a full line 558 # to return 559 line = lines[0] 560 del lines[0] 561 if len(lines) > 1: 562 # cache the remaining lines 563 lines[-1] += self.charbuffer 564 self.linebuffer = lines 565 self.charbuffer = None 566 else: 567 # only one remaining line, put it back into charbuffer 568 self.charbuffer = lines[0] + self.charbuffer 569 if not keepends: 570 line = line.splitlines(False)[0] 571 break 572 line0withend = lines[0] 573 line0withoutend = lines[0].splitlines(False)[0] 574 if line0withend != line0withoutend: # We really have a line end 575 # Put the rest back together and keep it until the next call 576 self.charbuffer = "".join(lines[1:]) + self.charbuffer 577 if keepends: 578 line = line0withend 579 else: 580 line = line0withoutend 581 break 582 # we didn't get anything or this was our only try 583 if not data or size is not None: 584 if line and not keepends: 585 line = line.splitlines(False)[0] 586 break 587 if readsize<8000: 588 readsize *= 2 589 return line 590 591 def readlines(self, sizehint=None, keepends=True): 592 593 """ Read all lines available on the input stream 594 and return them as list of lines. 595 596 Line breaks are implemented using the codec's decoder 597 method and are included in the list entries. 598 599 sizehint, if given, is ignored since there is no efficient 600 way to finding the true end-of-line. 601 602 """ 603 data = self.read() 604 return data.splitlines(keepends) 605 606 def reset(self): 607 608 """ Resets the codec buffers used for keeping state. 609 610 Note that no stream repositioning should take place. 611 This method is primarily intended to be able to recover 612 from decoding errors. 613 614 """ 615 self.bytebuffer = "" 616 self.charbuffer = u"" 617 self.linebuffer = None 618 619 def seek(self, offset, whence=0): 620 """ Set the input stream's current position. 621 622 Resets the codec buffers used for keeping state. 623 """ 624 self.stream.seek(offset, whence) 625 self.reset() 626 627 def next(self): 628 629 """ Return the next decoded line from the input stream.""" 630 line = self.readline() 631 if line: 632 return line 633 raise StopIteration 634 635 def __iter__(self): 636 return self 637 638 def __getattr__(self, name, 639 getattr=getattr): 640 641 """ Inherit all other methods from the underlying stream. 642 """ 643 return getattr(self.stream, name) 644 645 def __enter__(self): 646 return self 647 648 def __exit__(self, type, value, tb): 649 self.stream.close() 650 651 ### 652 653 class StreamReaderWriter: 654 655 """ StreamReaderWriter instances allow wrapping streams which 656 work in both read and write modes. 657 658 The design is such that one can use the factory functions 659 returned by the codec.lookup() function to construct the 660 instance. 661 662 """ 663 # Optional attributes set by the file wrappers below 664 encoding = 'unknown' 665 666 def __init__(self, stream, Reader, Writer, errors='strict'): 667 668 """ Creates a StreamReaderWriter instance. 669 670 stream must be a Stream-like object. 671 672 Reader, Writer must be factory functions or classes 673 providing the StreamReader, StreamWriter interface resp. 674 675 Error handling is done in the same way as defined for the 676 StreamWriter/Readers. 677 678 """ 679 self.stream = stream 680 self.reader = Reader(stream, errors) 681 self.writer = Writer(stream, errors) 682 self.errors = errors 683 684 def read(self, size=-1): 685 686 return self.reader.read(size) 687 688 def readline(self, size=None): 689 690 return self.reader.readline(size) 691 692 def readlines(self, sizehint=None): 693 694 return self.reader.readlines(sizehint) 695 696 def next(self): 697 698 """ Return the next decoded line from the input stream.""" 699 return self.reader.next() 700 701 def __iter__(self): 702 return self 703 704 def write(self, data): 705 706 return self.writer.write(data) 707 708 def writelines(self, list): 709 710 return self.writer.writelines(list) 711 712 def reset(self): 713 714 self.reader.reset() 715 self.writer.reset() 716 717 def seek(self, offset, whence=0): 718 self.stream.seek(offset, whence) 719 self.reader.reset() 720 if whence == 0 and offset == 0: 721 self.writer.reset() 722 723 def __getattr__(self, name, 724 getattr=getattr): 725 726 """ Inherit all other methods from the underlying stream. 727 """ 728 return getattr(self.stream, name) 729 730 # these are needed to make "with codecs.open(...)" work properly 731 732 def __enter__(self): 733 return self 734 735 def __exit__(self, type, value, tb): 736 self.stream.close() 737 738 ### 739 740 class StreamRecoder: 741 742 """ StreamRecoder instances provide a frontend - backend 743 view of encoding data. 744 745 They use the complete set of APIs returned by the 746 codecs.lookup() function to implement their task. 747 748 Data written to the stream is first decoded into an 749 intermediate format (which is dependent on the given codec 750 combination) and then written to the stream using an instance 751 of the provided Writer class. 752 753 In the other direction, data is read from the stream using a 754 Reader instance and then return encoded data to the caller. 755 756 """ 757 # Optional attributes set by the file wrappers below 758 data_encoding = 'unknown' 759 file_encoding = 'unknown' 760 761 def __init__(self, stream, encode, decode, Reader, Writer, 762 errors='strict'): 763 764 """ Creates a StreamRecoder instance which implements a two-way 765 conversion: encode and decode work on the frontend (the 766 input to .read() and output of .write()) while 767 Reader and Writer work on the backend (reading and 768 writing to the stream). 769 770 You can use these objects to do transparent direct 771 recodings from e.g. latin-1 to utf-8 and back. 772 773 stream must be a file-like object. 774 775 encode, decode must adhere to the Codec interface, Reader, 776 Writer must be factory functions or classes providing the 777 StreamReader, StreamWriter interface resp. 778 779 encode and decode are needed for the frontend translation, 780 Reader and Writer for the backend translation. Unicode is 781 used as intermediate encoding. 782 783 Error handling is done in the same way as defined for the 784 StreamWriter/Readers. 785 786 """ 787 self.stream = stream 788 self.encode = encode 789 self.decode = decode 790 self.reader = Reader(stream, errors) 791 self.writer = Writer(stream, errors) 792 self.errors = errors 793 794 def read(self, size=-1): 795 796 data = self.reader.read(size) 797 data, bytesencoded = self.encode(data, self.errors) 798 return data 799 800 def readline(self, size=None): 801 802 if size is None: 803 data = self.reader.readline() 804 else: 805 data = self.reader.readline(size) 806 data, bytesencoded = self.encode(data, self.errors) 807 return data 808 809 def readlines(self, sizehint=None): 810 811 data = self.reader.read() 812 data, bytesencoded = self.encode(data, self.errors) 813 return data.splitlines(1) 814 815 def next(self): 816 817 """ Return the next decoded line from the input stream.""" 818 data = self.reader.next() 819 data, bytesencoded = self.encode(data, self.errors) 820 return data 821 822 def __iter__(self): 823 return self 824 825 def write(self, data): 826 827 data, bytesdecoded = self.decode(data, self.errors) 828 return self.writer.write(data) 829 830 def writelines(self, list): 831 832 data = ''.join(list) 833 data, bytesdecoded = self.decode(data, self.errors) 834 return self.writer.write(data) 835 836 def reset(self): 837 838 self.reader.reset() 839 self.writer.reset() 840 841 def __getattr__(self, name, 842 getattr=getattr): 843 844 """ Inherit all other methods from the underlying stream. 845 """ 846 return getattr(self.stream, name) 847 848 def __enter__(self): 849 return self 850 851 def __exit__(self, type, value, tb): 852 self.stream.close() 853 854 ### Shortcuts 855 856 def open(filename, mode='rb', encoding=None, errors='strict', buffering=1): 857 858 """ Open an encoded file using the given mode and return 859 a wrapped version providing transparent encoding/decoding. 860 861 Note: The wrapped version will only accept the object format 862 defined by the codecs, i.e. Unicode objects for most builtin 863 codecs. Output is also codec dependent and will usually be 864 Unicode as well. 865 866 Files are always opened in binary mode, even if no binary mode 867 was specified. This is done to avoid data loss due to encodings 868 using 8-bit values. The default file mode is 'rb' meaning to 869 open the file in binary read mode. 870 871 encoding specifies the encoding which is to be used for the 872 file. 873 874 errors may be given to define the error handling. It defaults 875 to 'strict' which causes ValueErrors to be raised in case an 876 encoding error occurs. 877 878 buffering has the same meaning as for the builtin open() API. 879 It defaults to line buffered. 880 881 The returned wrapped file object provides an extra attribute 882 .encoding which allows querying the used encoding. This 883 attribute is only available if an encoding was specified as 884 parameter. 885 886 """ 887 if encoding is not None: 888 if 'U' in mode: 889 # No automatic conversion of '\n' is done on reading and writing 890 mode = mode.strip().replace('U', '') 891 if mode[:1] not in set('rwa'): 892 mode = 'r' + mode 893 if 'b' not in mode: 894 # Force opening of the file in binary mode 895 mode = mode + 'b' 896 file = __builtin__.open(filename, mode, buffering) 897 if encoding is None: 898 return file 899 info = lookup(encoding) 900 srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors) 901 # Add attributes to simplify introspection 902 srw.encoding = encoding 903 return srw 904 905 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'): 906 907 """ Return a wrapped version of file which provides transparent 908 encoding translation. 909 910 Strings written to the wrapped file are interpreted according 911 to the given data_encoding and then written to the original 912 file as string using file_encoding. The intermediate encoding 913 will usually be Unicode but depends on the specified codecs. 914 915 Strings are read from the file using file_encoding and then 916 passed back to the caller as string using data_encoding. 917 918 If file_encoding is not given, it defaults to data_encoding. 919 920 errors may be given to define the error handling. It defaults 921 to 'strict' which causes ValueErrors to be raised in case an 922 encoding error occurs. 923 924 The returned wrapped file object provides two extra attributes 925 .data_encoding and .file_encoding which reflect the given 926 parameters of the same name. The attributes can be used for 927 introspection by Python programs. 928 929 """ 930 if file_encoding is None: 931 file_encoding = data_encoding 932 data_info = lookup(data_encoding) 933 file_info = lookup(file_encoding) 934 sr = StreamRecoder(file, data_info.encode, data_info.decode, 935 file_info.streamreader, file_info.streamwriter, errors) 936 # Add attributes to simplify introspection 937 sr.data_encoding = data_encoding 938 sr.file_encoding = file_encoding 939 return sr 940 941 ### Helpers for codec lookup 942 943 def getencoder(encoding): 944 945 """ Lookup up the codec for the given encoding and return 946 its encoder function. 947 948 Raises a LookupError in case the encoding cannot be found. 949 950 """ 951 return lookup(encoding).encode 952 953 def getdecoder(encoding): 954 955 """ Lookup up the codec for the given encoding and return 956 its decoder function. 957 958 Raises a LookupError in case the encoding cannot be found. 959 960 """ 961 return lookup(encoding).decode 962 963 def getincrementalencoder(encoding): 964 965 """ Lookup up the codec for the given encoding and return 966 its IncrementalEncoder class or factory function. 967 968 Raises a LookupError in case the encoding cannot be found 969 or the codecs doesn't provide an incremental encoder. 970 971 """ 972 encoder = lookup(encoding).incrementalencoder 973 if encoder is None: 974 raise LookupError(encoding) 975 return encoder 976 977 def getincrementaldecoder(encoding): 978 979 """ Lookup up the codec for the given encoding and return 980 its IncrementalDecoder class or factory function. 981 982 Raises a LookupError in case the encoding cannot be found 983 or the codecs doesn't provide an incremental decoder. 984 985 """ 986 decoder = lookup(encoding).incrementaldecoder 987 if decoder is None: 988 raise LookupError(encoding) 989 return decoder 990 991 def getreader(encoding): 992 993 """ Lookup up the codec for the given encoding and return 994 its StreamReader class or factory function. 995 996 Raises a LookupError in case the encoding cannot be found. 997 998 """ 999 return lookup(encoding).streamreader 1000 1001 def getwriter(encoding): 1002 1003 """ Lookup up the codec for the given encoding and return 1004 its StreamWriter class or factory function. 1005 1006 Raises a LookupError in case the encoding cannot be found. 1007 1008 """ 1009 return lookup(encoding).streamwriter 1010 1011 def iterencode(iterator, encoding, errors='strict', **kwargs): 1012 """ 1013 Encoding iterator. 1014 1015 Encodes the input strings from the iterator using an IncrementalEncoder. 1016 1017 errors and kwargs are passed through to the IncrementalEncoder 1018 constructor. 1019 """ 1020 encoder = getincrementalencoder(encoding)(errors, **kwargs) 1021 for input in iterator: 1022 output = encoder.encode(input) 1023 if output: 1024 yield output 1025 output = encoder.encode("", True) 1026 if output: 1027 yield output 1028 1029 def iterdecode(iterator, encoding, errors='strict', **kwargs): 1030 """ 1031 Decoding iterator. 1032 1033 Decodes the input strings from the iterator using an IncrementalDecoder. 1034 1035 errors and kwargs are passed through to the IncrementalDecoder 1036 constructor. 1037 """ 1038 decoder = getincrementaldecoder(encoding)(errors, **kwargs) 1039 for input in iterator: 1040 output = decoder.decode(input) 1041 if output: 1042 yield output 1043 output = decoder.decode("", True) 1044 if output: 1045 yield output 1046 1047 ### Helpers for charmap-based codecs 1048 1049 def make_identity_dict(rng): 1050 1051 """ make_identity_dict(rng) -> dict 1052 1053 Return a dictionary where elements of the rng sequence are 1054 mapped to themselves. 1055 1056 """ 1057 res = {} 1058 for i in rng: 1059 res[i]=i 1060 return res 1061 1062 def make_encoding_map(decoding_map): 1063 1064 """ Creates an encoding map from a decoding map. 1065 1066 If a target mapping in the decoding map occurs multiple 1067 times, then that target is mapped to None (undefined mapping), 1068 causing an exception when encountered by the charmap codec 1069 during translation. 1070 1071 One example where this happens is cp875.py which decodes 1072 multiple character to \\u001a. 1073 1074 """ 1075 m = {} 1076 for k,v in decoding_map.items(): 1077 if not v in m: 1078 m[v] = k 1079 else: 1080 m[v] = None 1081 return m 1082 1083 ### error handlers 1084 1085 try: 1086 strict_errors = lookup_error("strict") 1087 ignore_errors = lookup_error("ignore") 1088 replace_errors = lookup_error("replace") 1089 xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace") 1090 backslashreplace_errors = lookup_error("backslashreplace") 1091 except LookupError: 1092 # In --disable-unicode builds, these error handler are missing 1093 strict_errors = None 1094 ignore_errors = None 1095 replace_errors = None 1096 xmlcharrefreplace_errors = None 1097 backslashreplace_errors = None 1098 1099 # Tell modulefinder that using codecs probably needs the encodings 1100 # package 1101 _false = 0 1102 if _false: 1103 import encodings 1104 1105 ### Tests 1106 1107 if __name__ == '__main__': 1108 1109 # Make stdout translate Latin-1 output into UTF-8 output 1110 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8') 1111 1112 # Have stdin translate Latin-1 input into UTF-8 input 1113 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1') 1114