1 """ codecs -- Python Codec Registry, API and helpers. 2 3 4 Written by Marc-Andre Lemburg (mal (at] lemburg.com). 5 6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 7 8 """#" 9 10 import __builtin__, sys 11 12 ### Registry and builtin stateless codec functions 13 14 try: 15 from _codecs import * 16 except ImportError, why: 17 raise SystemError('Failed to load the builtin codecs: %s' % why) 18 19 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE", 20 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", 21 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE", 22 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE", 23 "strict_errors", "ignore_errors", "replace_errors", 24 "xmlcharrefreplace_errors", 25 "register_error", "lookup_error"] 26 27 ### Constants 28 29 # 30 # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) 31 # and its possible byte string values 32 # for UTF8/UTF16/UTF32 output and little/big endian machines 33 # 34 35 # UTF-8 36 BOM_UTF8 = '\xef\xbb\xbf' 37 38 # UTF-16, little endian 39 BOM_LE = BOM_UTF16_LE = '\xff\xfe' 40 41 # UTF-16, big endian 42 BOM_BE = BOM_UTF16_BE = '\xfe\xff' 43 44 # UTF-32, little endian 45 BOM_UTF32_LE = '\xff\xfe\x00\x00' 46 47 # UTF-32, big endian 48 BOM_UTF32_BE = '\x00\x00\xfe\xff' 49 50 if sys.byteorder == 'little': 51 52 # UTF-16, native endianness 53 BOM = BOM_UTF16 = BOM_UTF16_LE 54 55 # UTF-32, native endianness 56 BOM_UTF32 = BOM_UTF32_LE 57 58 else: 59 60 # UTF-16, native endianness 61 BOM = BOM_UTF16 = BOM_UTF16_BE 62 63 # UTF-32, native endianness 64 BOM_UTF32 = BOM_UTF32_BE 65 66 # Old broken names (don't use in new code) 67 BOM32_LE = BOM_UTF16_LE 68 BOM32_BE = BOM_UTF16_BE 69 BOM64_LE = BOM_UTF32_LE 70 BOM64_BE = BOM_UTF32_BE 71 72 73 ### Codec base classes (defining the API) 74 75 class CodecInfo(tuple): 76 77 def __new__(cls, encode, decode, streamreader=None, streamwriter=None, 78 incrementalencoder=None, incrementaldecoder=None, name=None): 79 self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter)) 80 self.name = name 81 self.encode = encode 82 self.decode = decode 83 self.incrementalencoder = incrementalencoder 84 self.incrementaldecoder = incrementaldecoder 85 self.streamwriter = streamwriter 86 self.streamreader = streamreader 87 return self 88 89 def __repr__(self): 90 return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self)) 91 92 class Codec: 93 94 """ Defines the interface for stateless encoders/decoders. 95 96 The .encode()/.decode() methods may use different error 97 handling schemes by providing the errors argument. These 98 string values are predefined: 99 100 'strict' - raise a ValueError error (or a subclass) 101 'ignore' - ignore the character and continue with the next 102 'replace' - replace with a suitable replacement character; 103 Python will use the official U+FFFD REPLACEMENT 104 CHARACTER for the builtin Unicode codecs on 105 decoding and '?' on encoding. 106 'xmlcharrefreplace' - Replace with the appropriate XML 107 character reference (only for encoding). 108 'backslashreplace' - Replace with backslashed escape sequences 109 (only for encoding). 110 111 The set of allowed values can be extended via register_error. 112 113 """ 114 def encode(self, input, errors='strict'): 115 116 """ Encodes the object input and returns a tuple (output 117 object, length consumed). 118 119 errors defines the error handling to apply. It defaults to 120 'strict' handling. 121 122 The method may not store state in the Codec instance. Use 123 StreamCodec for codecs which have to keep state in order to 124 make encoding/decoding efficient. 125 126 The encoder must be able to handle zero length input and 127 return an empty object of the output object type in this 128 situation. 129 130 """ 131 raise NotImplementedError 132 133 def decode(self, input, errors='strict'): 134 135 """ Decodes the object input and returns a tuple (output 136 object, length consumed). 137 138 input must be an object which provides the bf_getreadbuf 139 buffer slot. Python strings, buffer objects and memory 140 mapped files are examples of objects providing this slot. 141 142 errors defines the error handling to apply. It defaults to 143 'strict' handling. 144 145 The method may not store state in the Codec instance. Use 146 StreamCodec for codecs which have to keep state in order to 147 make encoding/decoding efficient. 148 149 The decoder must be able to handle zero length input and 150 return an empty object of the output object type in this 151 situation. 152 153 """ 154 raise NotImplementedError 155 156 class IncrementalEncoder(object): 157 """ 158 An IncrementalEncoder encodes an input in multiple steps. The input can be 159 passed piece by piece to the encode() method. The IncrementalEncoder remembers 160 the state of the Encoding process between calls to encode(). 161 """ 162 def __init__(self, errors='strict'): 163 """ 164 Creates an IncrementalEncoder instance. 165 166 The IncrementalEncoder may use different error handling schemes by 167 providing the errors keyword argument. See the module docstring 168 for a list of possible values. 169 """ 170 self.errors = errors 171 self.buffer = "" 172 173 def encode(self, input, final=False): 174 """ 175 Encodes input and returns the resulting object. 176 """ 177 raise NotImplementedError 178 179 def reset(self): 180 """ 181 Resets the encoder to the initial state. 182 """ 183 184 def getstate(self): 185 """ 186 Return the current state of the encoder. 187 """ 188 return 0 189 190 def setstate(self, state): 191 """ 192 Set the current state of the encoder. state must have been 193 returned by getstate(). 194 """ 195 196 class BufferedIncrementalEncoder(IncrementalEncoder): 197 """ 198 This subclass of IncrementalEncoder can be used as the baseclass for an 199 incremental encoder if the encoder must keep some of the output in a 200 buffer between calls to encode(). 201 """ 202 def __init__(self, errors='strict'): 203 IncrementalEncoder.__init__(self, errors) 204 self.buffer = "" # unencoded input that is kept between calls to encode() 205 206 def _buffer_encode(self, input, errors, final): 207 # Overwrite this method in subclasses: It must encode input 208 # and return an (output, length consumed) tuple 209 raise NotImplementedError 210 211 def encode(self, input, final=False): 212 # encode input (taking the buffer into account) 213 data = self.buffer + input 214 (result, consumed) = self._buffer_encode(data, self.errors, final) 215 # keep unencoded input until the next call 216 self.buffer = data[consumed:] 217 return result 218 219 def reset(self): 220 IncrementalEncoder.reset(self) 221 self.buffer = "" 222 223 def getstate(self): 224 return self.buffer or 0 225 226 def setstate(self, state): 227 self.buffer = state or "" 228 229 class IncrementalDecoder(object): 230 """ 231 An IncrementalDecoder decodes an input in multiple steps. The input can be 232 passed piece by piece to the decode() method. The IncrementalDecoder 233 remembers the state of the decoding process between calls to decode(). 234 """ 235 def __init__(self, errors='strict'): 236 """ 237 Creates a IncrementalDecoder instance. 238 239 The IncrementalDecoder may use different error handling schemes by 240 providing the errors keyword argument. See the module docstring 241 for a list of possible values. 242 """ 243 self.errors = errors 244 245 def decode(self, input, final=False): 246 """ 247 Decodes input and returns the resulting object. 248 """ 249 raise NotImplementedError 250 251 def reset(self): 252 """ 253 Resets the decoder to the initial state. 254 """ 255 256 def getstate(self): 257 """ 258 Return the current state of the decoder. 259 260 This must be a (buffered_input, additional_state_info) tuple. 261 buffered_input must be a bytes object containing bytes that 262 were passed to decode() that have not yet been converted. 263 additional_state_info must be a non-negative integer 264 representing the state of the decoder WITHOUT yet having 265 processed the contents of buffered_input. In the initial state 266 and after reset(), getstate() must return (b"", 0). 267 """ 268 return (b"", 0) 269 270 def setstate(self, state): 271 """ 272 Set the current state of the decoder. 273 274 state must have been returned by getstate(). The effect of 275 setstate((b"", 0)) must be equivalent to reset(). 276 """ 277 278 class BufferedIncrementalDecoder(IncrementalDecoder): 279 """ 280 This subclass of IncrementalDecoder can be used as the baseclass for an 281 incremental decoder if the decoder must be able to handle incomplete byte 282 sequences. 283 """ 284 def __init__(self, errors='strict'): 285 IncrementalDecoder.__init__(self, errors) 286 self.buffer = "" # undecoded input that is kept between calls to decode() 287 288 def _buffer_decode(self, input, errors, final): 289 # Overwrite this method in subclasses: It must decode input 290 # and return an (output, length consumed) tuple 291 raise NotImplementedError 292 293 def decode(self, input, final=False): 294 # decode input (taking the buffer into account) 295 data = self.buffer + input 296 (result, consumed) = self._buffer_decode(data, self.errors, final) 297 # keep undecoded input until the next call 298 self.buffer = data[consumed:] 299 return result 300 301 def reset(self): 302 IncrementalDecoder.reset(self) 303 self.buffer = "" 304 305 def getstate(self): 306 # additional state info is always 0 307 return (self.buffer, 0) 308 309 def setstate(self, state): 310 # ignore additional state info 311 self.buffer = state[0] 312 313 # 314 # The StreamWriter and StreamReader class provide generic working 315 # interfaces which can be used to implement new encoding submodules 316 # very easily. See encodings/utf_8.py for an example on how this is 317 # done. 318 # 319 320 class StreamWriter(Codec): 321 322 def __init__(self, stream, errors='strict'): 323 324 """ Creates a StreamWriter instance. 325 326 stream must be a file-like object open for writing 327 (binary) data. 328 329 The StreamWriter may use different error handling 330 schemes by providing the errors keyword argument. These 331 parameters are predefined: 332 333 'strict' - raise a ValueError (or a subclass) 334 'ignore' - ignore the character and continue with the next 335 'replace'- replace with a suitable replacement character 336 'xmlcharrefreplace' - Replace with the appropriate XML 337 character reference. 338 'backslashreplace' - Replace with backslashed escape 339 sequences (only for encoding). 340 341 The set of allowed parameter values can be extended via 342 register_error. 343 """ 344 self.stream = stream 345 self.errors = errors 346 347 def write(self, object): 348 349 """ Writes the object's contents encoded to self.stream. 350 """ 351 data, consumed = self.encode(object, self.errors) 352 self.stream.write(data) 353 354 def writelines(self, list): 355 356 """ Writes the concatenated list of strings to the stream 357 using .write(). 358 """ 359 self.write(''.join(list)) 360 361 def reset(self): 362 363 """ Flushes and resets the codec buffers used for keeping state. 364 365 Calling this method should ensure that the data on the 366 output is put into a clean state, that allows appending 367 of new fresh data without having to rescan the whole 368 stream to recover state. 369 370 """ 371 pass 372 373 def seek(self, offset, whence=0): 374 self.stream.seek(offset, whence) 375 if whence == 0 and offset == 0: 376 self.reset() 377 378 def __getattr__(self, name, 379 getattr=getattr): 380 381 """ Inherit all other methods from the underlying stream. 382 """ 383 return getattr(self.stream, name) 384 385 def __enter__(self): 386 return self 387 388 def __exit__(self, type, value, tb): 389 self.stream.close() 390 391 ### 392 393 class StreamReader(Codec): 394 395 def __init__(self, stream, errors='strict'): 396 397 """ Creates a StreamReader instance. 398 399 stream must be a file-like object open for reading 400 (binary) data. 401 402 The StreamReader may use different error handling 403 schemes by providing the errors keyword argument. These 404 parameters are predefined: 405 406 'strict' - raise a ValueError (or a subclass) 407 'ignore' - ignore the character and continue with the next 408 'replace'- replace with a suitable replacement character; 409 410 The set of allowed parameter values can be extended via 411 register_error. 412 """ 413 self.stream = stream 414 self.errors = errors 415 self.bytebuffer = "" 416 # For str->str decoding this will stay a str 417 # For str->unicode decoding the first read will promote it to unicode 418 self.charbuffer = "" 419 self.linebuffer = None 420 421 def decode(self, input, errors='strict'): 422 raise NotImplementedError 423 424 def read(self, size=-1, chars=-1, firstline=False): 425 426 """ Decodes data from the stream self.stream and returns the 427 resulting object. 428 429 chars indicates the number of characters to read from the 430 stream. read() will never return more than chars 431 characters, but it might return less, if there are not enough 432 characters available. 433 434 size indicates the approximate maximum number of bytes to 435 read from the stream for decoding purposes. The decoder 436 can modify this setting as appropriate. The default value 437 -1 indicates to read and decode as much as possible. size 438 is intended to prevent having to decode huge files in one 439 step. 440 441 If firstline is true, and a UnicodeDecodeError happens 442 after the first line terminator in the input only the first line 443 will be returned, the rest of the input will be kept until the 444 next call to read(). 445 446 The method should use a greedy read strategy meaning that 447 it should read as much data as is allowed within the 448 definition of the encoding and the given size, e.g. if 449 optional encoding endings or state markers are available 450 on the stream, these should be read too. 451 """ 452 # If we have lines cached, first merge them back into characters 453 if self.linebuffer: 454 self.charbuffer = "".join(self.linebuffer) 455 self.linebuffer = None 456 457 # read until we get the required number of characters (if available) 458 while True: 459 # can the request can be satisfied from the character buffer? 460 if chars < 0: 461 if size < 0: 462 if self.charbuffer: 463 break 464 elif len(self.charbuffer) >= size: 465 break 466 else: 467 if len(self.charbuffer) >= chars: 468 break 469 # we need more data 470 if size < 0: 471 newdata = self.stream.read() 472 else: 473 newdata = self.stream.read(size) 474 # decode bytes (those remaining from the last call included) 475 data = self.bytebuffer + newdata 476 try: 477 newchars, decodedbytes = self.decode(data, self.errors) 478 except UnicodeDecodeError, exc: 479 if firstline: 480 newchars, decodedbytes = self.decode(data[:exc.start], self.errors) 481 lines = newchars.splitlines(True) 482 if len(lines)<=1: 483 raise 484 else: 485 raise 486 # keep undecoded bytes until the next call 487 self.bytebuffer = data[decodedbytes:] 488 # put new characters in the character buffer 489 self.charbuffer += newchars 490 # there was no data available 491 if not newdata: 492 break 493 if chars < 0: 494 # Return everything we've got 495 result = self.charbuffer 496 self.charbuffer = "" 497 else: 498 # Return the first chars characters 499 result = self.charbuffer[:chars] 500 self.charbuffer = self.charbuffer[chars:] 501 return result 502 503 def readline(self, size=None, keepends=True): 504 505 """ Read one line from the input stream and return the 506 decoded data. 507 508 size, if given, is passed as size argument to the 509 read() method. 510 511 """ 512 # If we have lines cached from an earlier read, return 513 # them unconditionally 514 if self.linebuffer: 515 line = self.linebuffer[0] 516 del self.linebuffer[0] 517 if len(self.linebuffer) == 1: 518 # revert to charbuffer mode; we might need more data 519 # next time 520 self.charbuffer = self.linebuffer[0] 521 self.linebuffer = None 522 if not keepends: 523 line = line.splitlines(False)[0] 524 return line 525 526 readsize = size or 72 527 line = "" 528 # If size is given, we call read() only once 529 while True: 530 data = self.read(readsize, firstline=True) 531 if data: 532 # If we're at a "\r" read one extra character (which might 533 # be a "\n") to get a proper line ending. If the stream is 534 # temporarily exhausted we return the wrong line ending. 535 if data.endswith("\r"): 536 data += self.read(size=1, chars=1) 537 538 line += data 539 lines = line.splitlines(True) 540 if lines: 541 if len(lines) > 1: 542 # More than one line result; the first line is a full line 543 # to return 544 line = lines[0] 545 del lines[0] 546 if len(lines) > 1: 547 # cache the remaining lines 548 lines[-1] += self.charbuffer 549 self.linebuffer = lines 550 self.charbuffer = None 551 else: 552 # only one remaining line, put it back into charbuffer 553 self.charbuffer = lines[0] + self.charbuffer 554 if not keepends: 555 line = line.splitlines(False)[0] 556 break 557 line0withend = lines[0] 558 line0withoutend = lines[0].splitlines(False)[0] 559 if line0withend != line0withoutend: # We really have a line end 560 # Put the rest back together and keep it until the next call 561 self.charbuffer = "".join(lines[1:]) + self.charbuffer 562 if keepends: 563 line = line0withend 564 else: 565 line = line0withoutend 566 break 567 # we didn't get anything or this was our only try 568 if not data or size is not None: 569 if line and not keepends: 570 line = line.splitlines(False)[0] 571 break 572 if readsize<8000: 573 readsize *= 2 574 return line 575 576 def readlines(self, sizehint=None, keepends=True): 577 578 """ Read all lines available on the input stream 579 and return them as list of lines. 580 581 Line breaks are implemented using the codec's decoder 582 method and are included in the list entries. 583 584 sizehint, if given, is ignored since there is no efficient 585 way to finding the true end-of-line. 586 587 """ 588 data = self.read() 589 return data.splitlines(keepends) 590 591 def reset(self): 592 593 """ Resets the codec buffers used for keeping state. 594 595 Note that no stream repositioning should take place. 596 This method is primarily intended to be able to recover 597 from decoding errors. 598 599 """ 600 self.bytebuffer = "" 601 self.charbuffer = u"" 602 self.linebuffer = None 603 604 def seek(self, offset, whence=0): 605 """ Set the input stream's current position. 606 607 Resets the codec buffers used for keeping state. 608 """ 609 self.stream.seek(offset, whence) 610 self.reset() 611 612 def next(self): 613 614 """ Return the next decoded line from the input stream.""" 615 line = self.readline() 616 if line: 617 return line 618 raise StopIteration 619 620 def __iter__(self): 621 return self 622 623 def __getattr__(self, name, 624 getattr=getattr): 625 626 """ Inherit all other methods from the underlying stream. 627 """ 628 return getattr(self.stream, name) 629 630 def __enter__(self): 631 return self 632 633 def __exit__(self, type, value, tb): 634 self.stream.close() 635 636 ### 637 638 class StreamReaderWriter: 639 640 """ StreamReaderWriter instances allow wrapping streams which 641 work in both read and write modes. 642 643 The design is such that one can use the factory functions 644 returned by the codec.lookup() function to construct the 645 instance. 646 647 """ 648 # Optional attributes set by the file wrappers below 649 encoding = 'unknown' 650 651 def __init__(self, stream, Reader, Writer, errors='strict'): 652 653 """ Creates a StreamReaderWriter instance. 654 655 stream must be a Stream-like object. 656 657 Reader, Writer must be factory functions or classes 658 providing the StreamReader, StreamWriter interface resp. 659 660 Error handling is done in the same way as defined for the 661 StreamWriter/Readers. 662 663 """ 664 self.stream = stream 665 self.reader = Reader(stream, errors) 666 self.writer = Writer(stream, errors) 667 self.errors = errors 668 669 def read(self, size=-1): 670 671 return self.reader.read(size) 672 673 def readline(self, size=None): 674 675 return self.reader.readline(size) 676 677 def readlines(self, sizehint=None): 678 679 return self.reader.readlines(sizehint) 680 681 def next(self): 682 683 """ Return the next decoded line from the input stream.""" 684 return self.reader.next() 685 686 def __iter__(self): 687 return self 688 689 def write(self, data): 690 691 return self.writer.write(data) 692 693 def writelines(self, list): 694 695 return self.writer.writelines(list) 696 697 def reset(self): 698 699 self.reader.reset() 700 self.writer.reset() 701 702 def seek(self, offset, whence=0): 703 self.stream.seek(offset, whence) 704 self.reader.reset() 705 if whence == 0 and offset == 0: 706 self.writer.reset() 707 708 def __getattr__(self, name, 709 getattr=getattr): 710 711 """ Inherit all other methods from the underlying stream. 712 """ 713 return getattr(self.stream, name) 714 715 # these are needed to make "with codecs.open(...)" work properly 716 717 def __enter__(self): 718 return self 719 720 def __exit__(self, type, value, tb): 721 self.stream.close() 722 723 ### 724 725 class StreamRecoder: 726 727 """ StreamRecoder instances provide a frontend - backend 728 view of encoding data. 729 730 They use the complete set of APIs returned by the 731 codecs.lookup() function to implement their task. 732 733 Data written to the stream is first decoded into an 734 intermediate format (which is dependent on the given codec 735 combination) and then written to the stream using an instance 736 of the provided Writer class. 737 738 In the other direction, data is read from the stream using a 739 Reader instance and then return encoded data to the caller. 740 741 """ 742 # Optional attributes set by the file wrappers below 743 data_encoding = 'unknown' 744 file_encoding = 'unknown' 745 746 def __init__(self, stream, encode, decode, Reader, Writer, 747 errors='strict'): 748 749 """ Creates a StreamRecoder instance which implements a two-way 750 conversion: encode and decode work on the frontend (the 751 input to .read() and output of .write()) while 752 Reader and Writer work on the backend (reading and 753 writing to the stream). 754 755 You can use these objects to do transparent direct 756 recodings from e.g. latin-1 to utf-8 and back. 757 758 stream must be a file-like object. 759 760 encode, decode must adhere to the Codec interface, Reader, 761 Writer must be factory functions or classes providing the 762 StreamReader, StreamWriter interface resp. 763 764 encode and decode are needed for the frontend translation, 765 Reader and Writer for the backend translation. Unicode is 766 used as intermediate encoding. 767 768 Error handling is done in the same way as defined for the 769 StreamWriter/Readers. 770 771 """ 772 self.stream = stream 773 self.encode = encode 774 self.decode = decode 775 self.reader = Reader(stream, errors) 776 self.writer = Writer(stream, errors) 777 self.errors = errors 778 779 def read(self, size=-1): 780 781 data = self.reader.read(size) 782 data, bytesencoded = self.encode(data, self.errors) 783 return data 784 785 def readline(self, size=None): 786 787 if size is None: 788 data = self.reader.readline() 789 else: 790 data = self.reader.readline(size) 791 data, bytesencoded = self.encode(data, self.errors) 792 return data 793 794 def readlines(self, sizehint=None): 795 796 data = self.reader.read() 797 data, bytesencoded = self.encode(data, self.errors) 798 return data.splitlines(1) 799 800 def next(self): 801 802 """ Return the next decoded line from the input stream.""" 803 data = self.reader.next() 804 data, bytesencoded = self.encode(data, self.errors) 805 return data 806 807 def __iter__(self): 808 return self 809 810 def write(self, data): 811 812 data, bytesdecoded = self.decode(data, self.errors) 813 return self.writer.write(data) 814 815 def writelines(self, list): 816 817 data = ''.join(list) 818 data, bytesdecoded = self.decode(data, self.errors) 819 return self.writer.write(data) 820 821 def reset(self): 822 823 self.reader.reset() 824 self.writer.reset() 825 826 def __getattr__(self, name, 827 getattr=getattr): 828 829 """ Inherit all other methods from the underlying stream. 830 """ 831 return getattr(self.stream, name) 832 833 def __enter__(self): 834 return self 835 836 def __exit__(self, type, value, tb): 837 self.stream.close() 838 839 ### Shortcuts 840 841 def open(filename, mode='rb', encoding=None, errors='strict', buffering=1): 842 843 """ Open an encoded file using the given mode and return 844 a wrapped version providing transparent encoding/decoding. 845 846 Note: The wrapped version will only accept the object format 847 defined by the codecs, i.e. Unicode objects for most builtin 848 codecs. Output is also codec dependent and will usually be 849 Unicode as well. 850 851 Files are always opened in binary mode, even if no binary mode 852 was specified. This is done to avoid data loss due to encodings 853 using 8-bit values. The default file mode is 'rb' meaning to 854 open the file in binary read mode. 855 856 encoding specifies the encoding which is to be used for the 857 file. 858 859 errors may be given to define the error handling. It defaults 860 to 'strict' which causes ValueErrors to be raised in case an 861 encoding error occurs. 862 863 buffering has the same meaning as for the builtin open() API. 864 It defaults to line buffered. 865 866 The returned wrapped file object provides an extra attribute 867 .encoding which allows querying the used encoding. This 868 attribute is only available if an encoding was specified as 869 parameter. 870 871 """ 872 if encoding is not None: 873 if 'U' in mode: 874 # No automatic conversion of '\n' is done on reading and writing 875 mode = mode.strip().replace('U', '') 876 if mode[:1] not in set('rwa'): 877 mode = 'r' + mode 878 if 'b' not in mode: 879 # Force opening of the file in binary mode 880 mode = mode + 'b' 881 file = __builtin__.open(filename, mode, buffering) 882 if encoding is None: 883 return file 884 info = lookup(encoding) 885 srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors) 886 # Add attributes to simplify introspection 887 srw.encoding = encoding 888 return srw 889 890 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'): 891 892 """ Return a wrapped version of file which provides transparent 893 encoding translation. 894 895 Strings written to the wrapped file are interpreted according 896 to the given data_encoding and then written to the original 897 file as string using file_encoding. The intermediate encoding 898 will usually be Unicode but depends on the specified codecs. 899 900 Strings are read from the file using file_encoding and then 901 passed back to the caller as string using data_encoding. 902 903 If file_encoding is not given, it defaults to data_encoding. 904 905 errors may be given to define the error handling. It defaults 906 to 'strict' which causes ValueErrors to be raised in case an 907 encoding error occurs. 908 909 The returned wrapped file object provides two extra attributes 910 .data_encoding and .file_encoding which reflect the given 911 parameters of the same name. The attributes can be used for 912 introspection by Python programs. 913 914 """ 915 if file_encoding is None: 916 file_encoding = data_encoding 917 data_info = lookup(data_encoding) 918 file_info = lookup(file_encoding) 919 sr = StreamRecoder(file, data_info.encode, data_info.decode, 920 file_info.streamreader, file_info.streamwriter, errors) 921 # Add attributes to simplify introspection 922 sr.data_encoding = data_encoding 923 sr.file_encoding = file_encoding 924 return sr 925 926 ### Helpers for codec lookup 927 928 def getencoder(encoding): 929 930 """ Lookup up the codec for the given encoding and return 931 its encoder function. 932 933 Raises a LookupError in case the encoding cannot be found. 934 935 """ 936 return lookup(encoding).encode 937 938 def getdecoder(encoding): 939 940 """ Lookup up the codec for the given encoding and return 941 its decoder function. 942 943 Raises a LookupError in case the encoding cannot be found. 944 945 """ 946 return lookup(encoding).decode 947 948 def getincrementalencoder(encoding): 949 950 """ Lookup up the codec for the given encoding and return 951 its IncrementalEncoder class or factory function. 952 953 Raises a LookupError in case the encoding cannot be found 954 or the codecs doesn't provide an incremental encoder. 955 956 """ 957 encoder = lookup(encoding).incrementalencoder 958 if encoder is None: 959 raise LookupError(encoding) 960 return encoder 961 962 def getincrementaldecoder(encoding): 963 964 """ Lookup up the codec for the given encoding and return 965 its IncrementalDecoder class or factory function. 966 967 Raises a LookupError in case the encoding cannot be found 968 or the codecs doesn't provide an incremental decoder. 969 970 """ 971 decoder = lookup(encoding).incrementaldecoder 972 if decoder is None: 973 raise LookupError(encoding) 974 return decoder 975 976 def getreader(encoding): 977 978 """ Lookup up the codec for the given encoding and return 979 its StreamReader class or factory function. 980 981 Raises a LookupError in case the encoding cannot be found. 982 983 """ 984 return lookup(encoding).streamreader 985 986 def getwriter(encoding): 987 988 """ Lookup up the codec for the given encoding and return 989 its StreamWriter class or factory function. 990 991 Raises a LookupError in case the encoding cannot be found. 992 993 """ 994 return lookup(encoding).streamwriter 995 996 def iterencode(iterator, encoding, errors='strict', **kwargs): 997 """ 998 Encoding iterator. 999 1000 Encodes the input strings from the iterator using a IncrementalEncoder. 1001 1002 errors and kwargs are passed through to the IncrementalEncoder 1003 constructor. 1004 """ 1005 encoder = getincrementalencoder(encoding)(errors, **kwargs) 1006 for input in iterator: 1007 output = encoder.encode(input) 1008 if output: 1009 yield output 1010 output = encoder.encode("", True) 1011 if output: 1012 yield output 1013 1014 def iterdecode(iterator, encoding, errors='strict', **kwargs): 1015 """ 1016 Decoding iterator. 1017 1018 Decodes the input strings from the iterator using a IncrementalDecoder. 1019 1020 errors and kwargs are passed through to the IncrementalDecoder 1021 constructor. 1022 """ 1023 decoder = getincrementaldecoder(encoding)(errors, **kwargs) 1024 for input in iterator: 1025 output = decoder.decode(input) 1026 if output: 1027 yield output 1028 output = decoder.decode("", True) 1029 if output: 1030 yield output 1031 1032 ### Helpers for charmap-based codecs 1033 1034 def make_identity_dict(rng): 1035 1036 """ make_identity_dict(rng) -> dict 1037 1038 Return a dictionary where elements of the rng sequence are 1039 mapped to themselves. 1040 1041 """ 1042 res = {} 1043 for i in rng: 1044 res[i]=i 1045 return res 1046 1047 def make_encoding_map(decoding_map): 1048 1049 """ Creates an encoding map from a decoding map. 1050 1051 If a target mapping in the decoding map occurs multiple 1052 times, then that target is mapped to None (undefined mapping), 1053 causing an exception when encountered by the charmap codec 1054 during translation. 1055 1056 One example where this happens is cp875.py which decodes 1057 multiple character to \u001a. 1058 1059 """ 1060 m = {} 1061 for k,v in decoding_map.items(): 1062 if not v in m: 1063 m[v] = k 1064 else: 1065 m[v] = None 1066 return m 1067 1068 ### error handlers 1069 1070 try: 1071 strict_errors = lookup_error("strict") 1072 ignore_errors = lookup_error("ignore") 1073 replace_errors = lookup_error("replace") 1074 xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace") 1075 backslashreplace_errors = lookup_error("backslashreplace") 1076 except LookupError: 1077 # In --disable-unicode builds, these error handler are missing 1078 strict_errors = None 1079 ignore_errors = None 1080 replace_errors = None 1081 xmlcharrefreplace_errors = None 1082 backslashreplace_errors = None 1083 1084 # Tell modulefinder that using codecs probably needs the encodings 1085 # package 1086 _false = 0 1087 if _false: 1088 import encodings 1089 1090 ### Tests 1091 1092 if __name__ == '__main__': 1093 1094 # Make stdout translate Latin-1 output into UTF-8 output 1095 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8') 1096 1097 # Have stdin translate Latin-1 input into UTF-8 input 1098 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1') 1099