1 """ codecs -- Python Codec Registry, API and helpers. 2 3 4 Written by Marc-Andre Lemburg (mal (at] lemburg.com). 5 6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 7 8 """#" 9 10 import __builtin__, sys 11 12 ### Registry and builtin stateless codec functions 13 14 try: 15 from _codecs import * 16 except ImportError, why: 17 raise SystemError('Failed to load the builtin codecs: %s' % why) 18 19 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE", 20 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", 21 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE", 22 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE", 23 "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder", 24 "StreamReader", "StreamWriter", 25 "StreamReaderWriter", "StreamRecoder", 26 "getencoder", "getdecoder", "getincrementalencoder", 27 "getincrementaldecoder", "getreader", "getwriter", 28 "encode", "decode", "iterencode", "iterdecode", 29 "strict_errors", "ignore_errors", "replace_errors", 30 "xmlcharrefreplace_errors", "backslashreplace_errors", 31 "register_error", "lookup_error"] 32 33 ### Constants 34 35 # 36 # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) 37 # and its possible byte string values 38 # for UTF8/UTF16/UTF32 output and little/big endian machines 39 # 40 41 # UTF-8 42 BOM_UTF8 = '\xef\xbb\xbf' 43 44 # UTF-16, little endian 45 BOM_LE = BOM_UTF16_LE = '\xff\xfe' 46 47 # UTF-16, big endian 48 BOM_BE = BOM_UTF16_BE = '\xfe\xff' 49 50 # UTF-32, little endian 51 BOM_UTF32_LE = '\xff\xfe\x00\x00' 52 53 # UTF-32, big endian 54 BOM_UTF32_BE = '\x00\x00\xfe\xff' 55 56 if sys.byteorder == 'little': 57 58 # UTF-16, native endianness 59 BOM = BOM_UTF16 = BOM_UTF16_LE 60 61 # UTF-32, native endianness 62 BOM_UTF32 = BOM_UTF32_LE 63 64 else: 65 66 # UTF-16, native endianness 67 BOM = BOM_UTF16 = BOM_UTF16_BE 68 69 # UTF-32, native endianness 70 BOM_UTF32 = BOM_UTF32_BE 71 72 # Old broken names (don't use in new code) 73 BOM32_LE = BOM_UTF16_LE 74 BOM32_BE = BOM_UTF16_BE 75 BOM64_LE = BOM_UTF32_LE 76 BOM64_BE = BOM_UTF32_BE 77 78 79 ### Codec base classes (defining the API) 80 81 class CodecInfo(tuple): 82 83 def __new__(cls, encode, decode, streamreader=None, streamwriter=None, 84 incrementalencoder=None, incrementaldecoder=None, name=None): 85 self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter)) 86 self.name = name 87 self.encode = encode 88 self.decode = decode 89 self.incrementalencoder = incrementalencoder 90 self.incrementaldecoder = incrementaldecoder 91 self.streamwriter = streamwriter 92 self.streamreader = streamreader 93 return self 94 95 def __repr__(self): 96 return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self)) 97 98 class Codec: 99 100 """ Defines the interface for stateless encoders/decoders. 101 102 The .encode()/.decode() methods may use different error 103 handling schemes by providing the errors argument. These 104 string values are predefined: 105 106 'strict' - raise a ValueError error (or a subclass) 107 'ignore' - ignore the character and continue with the next 108 'replace' - replace with a suitable replacement character; 109 Python will use the official U+FFFD REPLACEMENT 110 CHARACTER for the builtin Unicode codecs on 111 decoding and '?' on encoding. 112 'xmlcharrefreplace' - Replace with the appropriate XML 113 character reference (only for encoding). 114 'backslashreplace' - Replace with backslashed escape sequences 115 (only for encoding). 116 117 The set of allowed values can be extended via register_error. 118 119 """ 120 def encode(self, input, errors='strict'): 121 122 """ Encodes the object input and returns a tuple (output 123 object, length consumed). 124 125 errors defines the error handling to apply. It defaults to 126 'strict' handling. 127 128 The method may not store state in the Codec instance. Use 129 StreamCodec for codecs which have to keep state in order to 130 make encoding/decoding efficient. 131 132 The encoder must be able to handle zero length input and 133 return an empty object of the output object type in this 134 situation. 135 136 """ 137 raise NotImplementedError 138 139 def decode(self, input, errors='strict'): 140 141 """ Decodes the object input and returns a tuple (output 142 object, length consumed). 143 144 input must be an object which provides the bf_getreadbuf 145 buffer slot. Python strings, buffer objects and memory 146 mapped files are examples of objects providing this slot. 147 148 errors defines the error handling to apply. It defaults to 149 'strict' handling. 150 151 The method may not store state in the Codec instance. Use 152 StreamCodec for codecs which have to keep state in order to 153 make encoding/decoding efficient. 154 155 The decoder must be able to handle zero length input and 156 return an empty object of the output object type in this 157 situation. 158 159 """ 160 raise NotImplementedError 161 162 class IncrementalEncoder(object): 163 """ 164 An IncrementalEncoder encodes an input in multiple steps. The input can be 165 passed piece by piece to the encode() method. The IncrementalEncoder remembers 166 the state of the Encoding process between calls to encode(). 167 """ 168 def __init__(self, errors='strict'): 169 """ 170 Creates an IncrementalEncoder instance. 171 172 The IncrementalEncoder may use different error handling schemes by 173 providing the errors keyword argument. See the module docstring 174 for a list of possible values. 175 """ 176 self.errors = errors 177 self.buffer = "" 178 179 def encode(self, input, final=False): 180 """ 181 Encodes input and returns the resulting object. 182 """ 183 raise NotImplementedError 184 185 def reset(self): 186 """ 187 Resets the encoder to the initial state. 188 """ 189 190 def getstate(self): 191 """ 192 Return the current state of the encoder. 193 """ 194 return 0 195 196 def setstate(self, state): 197 """ 198 Set the current state of the encoder. state must have been 199 returned by getstate(). 200 """ 201 202 class BufferedIncrementalEncoder(IncrementalEncoder): 203 """ 204 This subclass of IncrementalEncoder can be used as the baseclass for an 205 incremental encoder if the encoder must keep some of the output in a 206 buffer between calls to encode(). 207 """ 208 def __init__(self, errors='strict'): 209 IncrementalEncoder.__init__(self, errors) 210 self.buffer = "" # unencoded input that is kept between calls to encode() 211 212 def _buffer_encode(self, input, errors, final): 213 # Overwrite this method in subclasses: It must encode input 214 # and return an (output, length consumed) tuple 215 raise NotImplementedError 216 217 def encode(self, input, final=False): 218 # encode input (taking the buffer into account) 219 data = self.buffer + input 220 (result, consumed) = self._buffer_encode(data, self.errors, final) 221 # keep unencoded input until the next call 222 self.buffer = data[consumed:] 223 return result 224 225 def reset(self): 226 IncrementalEncoder.reset(self) 227 self.buffer = "" 228 229 def getstate(self): 230 return self.buffer or 0 231 232 def setstate(self, state): 233 self.buffer = state or "" 234 235 class IncrementalDecoder(object): 236 """ 237 An IncrementalDecoder decodes an input in multiple steps. The input can be 238 passed piece by piece to the decode() method. The IncrementalDecoder 239 remembers the state of the decoding process between calls to decode(). 240 """ 241 def __init__(self, errors='strict'): 242 """ 243 Creates a IncrementalDecoder instance. 244 245 The IncrementalDecoder may use different error handling schemes by 246 providing the errors keyword argument. See the module docstring 247 for a list of possible values. 248 """ 249 self.errors = errors 250 251 def decode(self, input, final=False): 252 """ 253 Decodes input and returns the resulting object. 254 """ 255 raise NotImplementedError 256 257 def reset(self): 258 """ 259 Resets the decoder to the initial state. 260 """ 261 262 def getstate(self): 263 """ 264 Return the current state of the decoder. 265 266 This must be a (buffered_input, additional_state_info) tuple. 267 buffered_input must be a bytes object containing bytes that 268 were passed to decode() that have not yet been converted. 269 additional_state_info must be a non-negative integer 270 representing the state of the decoder WITHOUT yet having 271 processed the contents of buffered_input. In the initial state 272 and after reset(), getstate() must return (b"", 0). 273 """ 274 return (b"", 0) 275 276 def setstate(self, state): 277 """ 278 Set the current state of the decoder. 279 280 state must have been returned by getstate(). The effect of 281 setstate((b"", 0)) must be equivalent to reset(). 282 """ 283 284 class BufferedIncrementalDecoder(IncrementalDecoder): 285 """ 286 This subclass of IncrementalDecoder can be used as the baseclass for an 287 incremental decoder if the decoder must be able to handle incomplete byte 288 sequences. 289 """ 290 def __init__(self, errors='strict'): 291 IncrementalDecoder.__init__(self, errors) 292 self.buffer = "" # undecoded input that is kept between calls to decode() 293 294 def _buffer_decode(self, input, errors, final): 295 # Overwrite this method in subclasses: It must decode input 296 # and return an (output, length consumed) tuple 297 raise NotImplementedError 298 299 def decode(self, input, final=False): 300 # decode input (taking the buffer into account) 301 data = self.buffer + input 302 (result, consumed) = self._buffer_decode(data, self.errors, final) 303 # keep undecoded input until the next call 304 self.buffer = data[consumed:] 305 return result 306 307 def reset(self): 308 IncrementalDecoder.reset(self) 309 self.buffer = "" 310 311 def getstate(self): 312 # additional state info is always 0 313 return (self.buffer, 0) 314 315 def setstate(self, state): 316 # ignore additional state info 317 self.buffer = state[0] 318 319 # 320 # The StreamWriter and StreamReader class provide generic working 321 # interfaces which can be used to implement new encoding submodules 322 # very easily. See encodings/utf_8.py for an example on how this is 323 # done. 324 # 325 326 class StreamWriter(Codec): 327 328 def __init__(self, stream, errors='strict'): 329 330 """ Creates a StreamWriter instance. 331 332 stream must be a file-like object open for writing 333 (binary) data. 334 335 The StreamWriter may use different error handling 336 schemes by providing the errors keyword argument. These 337 parameters are predefined: 338 339 'strict' - raise a ValueError (or a subclass) 340 'ignore' - ignore the character and continue with the next 341 'replace'- replace with a suitable replacement character 342 'xmlcharrefreplace' - Replace with the appropriate XML 343 character reference. 344 'backslashreplace' - Replace with backslashed escape 345 sequences (only for encoding). 346 347 The set of allowed parameter values can be extended via 348 register_error. 349 """ 350 self.stream = stream 351 self.errors = errors 352 353 def write(self, object): 354 355 """ Writes the object's contents encoded to self.stream. 356 """ 357 data, consumed = self.encode(object, self.errors) 358 self.stream.write(data) 359 360 def writelines(self, list): 361 362 """ Writes the concatenated list of strings to the stream 363 using .write(). 364 """ 365 self.write(''.join(list)) 366 367 def reset(self): 368 369 """ Flushes and resets the codec buffers used for keeping state. 370 371 Calling this method should ensure that the data on the 372 output is put into a clean state, that allows appending 373 of new fresh data without having to rescan the whole 374 stream to recover state. 375 376 """ 377 pass 378 379 def seek(self, offset, whence=0): 380 self.stream.seek(offset, whence) 381 if whence == 0 and offset == 0: 382 self.reset() 383 384 def __getattr__(self, name, 385 getattr=getattr): 386 387 """ Inherit all other methods from the underlying stream. 388 """ 389 return getattr(self.stream, name) 390 391 def __enter__(self): 392 return self 393 394 def __exit__(self, type, value, tb): 395 self.stream.close() 396 397 ### 398 399 class StreamReader(Codec): 400 401 def __init__(self, stream, errors='strict'): 402 403 """ Creates a StreamReader instance. 404 405 stream must be a file-like object open for reading 406 (binary) data. 407 408 The StreamReader may use different error handling 409 schemes by providing the errors keyword argument. These 410 parameters are predefined: 411 412 'strict' - raise a ValueError (or a subclass) 413 'ignore' - ignore the character and continue with the next 414 'replace'- replace with a suitable replacement character; 415 416 The set of allowed parameter values can be extended via 417 register_error. 418 """ 419 self.stream = stream 420 self.errors = errors 421 self.bytebuffer = "" 422 # For str->str decoding this will stay a str 423 # For str->unicode decoding the first read will promote it to unicode 424 self.charbuffer = "" 425 self.linebuffer = None 426 427 def decode(self, input, errors='strict'): 428 raise NotImplementedError 429 430 def read(self, size=-1, chars=-1, firstline=False): 431 432 """ Decodes data from the stream self.stream and returns the 433 resulting object. 434 435 chars indicates the number of characters to read from the 436 stream. read() will never return more than chars 437 characters, but it might return less, if there are not enough 438 characters available. 439 440 size indicates the approximate maximum number of bytes to 441 read from the stream for decoding purposes. The decoder 442 can modify this setting as appropriate. The default value 443 -1 indicates to read and decode as much as possible. size 444 is intended to prevent having to decode huge files in one 445 step. 446 447 If firstline is true, and a UnicodeDecodeError happens 448 after the first line terminator in the input only the first line 449 will be returned, the rest of the input will be kept until the 450 next call to read(). 451 452 The method should use a greedy read strategy meaning that 453 it should read as much data as is allowed within the 454 definition of the encoding and the given size, e.g. if 455 optional encoding endings or state markers are available 456 on the stream, these should be read too. 457 """ 458 # If we have lines cached, first merge them back into characters 459 if self.linebuffer: 460 self.charbuffer = "".join(self.linebuffer) 461 self.linebuffer = None 462 463 # read until we get the required number of characters (if available) 464 while True: 465 # can the request be satisfied from the character buffer? 466 if chars >= 0: 467 if len(self.charbuffer) >= chars: 468 break 469 elif size >= 0: 470 if len(self.charbuffer) >= size: 471 break 472 # we need more data 473 if size < 0: 474 newdata = self.stream.read() 475 else: 476 newdata = self.stream.read(size) 477 # decode bytes (those remaining from the last call included) 478 data = self.bytebuffer + newdata 479 try: 480 newchars, decodedbytes = self.decode(data, self.errors) 481 except UnicodeDecodeError, exc: 482 if firstline: 483 newchars, decodedbytes = self.decode(data[:exc.start], self.errors) 484 lines = newchars.splitlines(True) 485 if len(lines)<=1: 486 raise 487 else: 488 raise 489 # keep undecoded bytes until the next call 490 self.bytebuffer = data[decodedbytes:] 491 # put new characters in the character buffer 492 self.charbuffer += newchars 493 # there was no data available 494 if not newdata: 495 break 496 if chars < 0: 497 # Return everything we've got 498 result = self.charbuffer 499 self.charbuffer = "" 500 else: 501 # Return the first chars characters 502 result = self.charbuffer[:chars] 503 self.charbuffer = self.charbuffer[chars:] 504 return result 505 506 def readline(self, size=None, keepends=True): 507 508 """ Read one line from the input stream and return the 509 decoded data. 510 511 size, if given, is passed as size argument to the 512 read() method. 513 514 """ 515 # If we have lines cached from an earlier read, return 516 # them unconditionally 517 if self.linebuffer: 518 line = self.linebuffer[0] 519 del self.linebuffer[0] 520 if len(self.linebuffer) == 1: 521 # revert to charbuffer mode; we might need more data 522 # next time 523 self.charbuffer = self.linebuffer[0] 524 self.linebuffer = None 525 if not keepends: 526 line = line.splitlines(False)[0] 527 return line 528 529 readsize = size or 72 530 line = "" 531 # If size is given, we call read() only once 532 while True: 533 data = self.read(readsize, firstline=True) 534 if data: 535 # If we're at a "\r" read one extra character (which might 536 # be a "\n") to get a proper line ending. If the stream is 537 # temporarily exhausted we return the wrong line ending. 538 if data.endswith("\r"): 539 data += self.read(size=1, chars=1) 540 541 line += data 542 lines = line.splitlines(True) 543 if lines: 544 if len(lines) > 1: 545 # More than one line result; the first line is a full line 546 # to return 547 line = lines[0] 548 del lines[0] 549 if len(lines) > 1: 550 # cache the remaining lines 551 lines[-1] += self.charbuffer 552 self.linebuffer = lines 553 self.charbuffer = None 554 else: 555 # only one remaining line, put it back into charbuffer 556 self.charbuffer = lines[0] + self.charbuffer 557 if not keepends: 558 line = line.splitlines(False)[0] 559 break 560 line0withend = lines[0] 561 line0withoutend = lines[0].splitlines(False)[0] 562 if line0withend != line0withoutend: # We really have a line end 563 # Put the rest back together and keep it until the next call 564 self.charbuffer = "".join(lines[1:]) + self.charbuffer 565 if keepends: 566 line = line0withend 567 else: 568 line = line0withoutend 569 break 570 # we didn't get anything or this was our only try 571 if not data or size is not None: 572 if line and not keepends: 573 line = line.splitlines(False)[0] 574 break 575 if readsize<8000: 576 readsize *= 2 577 return line 578 579 def readlines(self, sizehint=None, keepends=True): 580 581 """ Read all lines available on the input stream 582 and return them as list of lines. 583 584 Line breaks are implemented using the codec's decoder 585 method and are included in the list entries. 586 587 sizehint, if given, is ignored since there is no efficient 588 way to finding the true end-of-line. 589 590 """ 591 data = self.read() 592 return data.splitlines(keepends) 593 594 def reset(self): 595 596 """ Resets the codec buffers used for keeping state. 597 598 Note that no stream repositioning should take place. 599 This method is primarily intended to be able to recover 600 from decoding errors. 601 602 """ 603 self.bytebuffer = "" 604 self.charbuffer = u"" 605 self.linebuffer = None 606 607 def seek(self, offset, whence=0): 608 """ Set the input stream's current position. 609 610 Resets the codec buffers used for keeping state. 611 """ 612 self.stream.seek(offset, whence) 613 self.reset() 614 615 def next(self): 616 617 """ Return the next decoded line from the input stream.""" 618 line = self.readline() 619 if line: 620 return line 621 raise StopIteration 622 623 def __iter__(self): 624 return self 625 626 def __getattr__(self, name, 627 getattr=getattr): 628 629 """ Inherit all other methods from the underlying stream. 630 """ 631 return getattr(self.stream, name) 632 633 def __enter__(self): 634 return self 635 636 def __exit__(self, type, value, tb): 637 self.stream.close() 638 639 ### 640 641 class StreamReaderWriter: 642 643 """ StreamReaderWriter instances allow wrapping streams which 644 work in both read and write modes. 645 646 The design is such that one can use the factory functions 647 returned by the codec.lookup() function to construct the 648 instance. 649 650 """ 651 # Optional attributes set by the file wrappers below 652 encoding = 'unknown' 653 654 def __init__(self, stream, Reader, Writer, errors='strict'): 655 656 """ Creates a StreamReaderWriter instance. 657 658 stream must be a Stream-like object. 659 660 Reader, Writer must be factory functions or classes 661 providing the StreamReader, StreamWriter interface resp. 662 663 Error handling is done in the same way as defined for the 664 StreamWriter/Readers. 665 666 """ 667 self.stream = stream 668 self.reader = Reader(stream, errors) 669 self.writer = Writer(stream, errors) 670 self.errors = errors 671 672 def read(self, size=-1): 673 674 return self.reader.read(size) 675 676 def readline(self, size=None): 677 678 return self.reader.readline(size) 679 680 def readlines(self, sizehint=None): 681 682 return self.reader.readlines(sizehint) 683 684 def next(self): 685 686 """ Return the next decoded line from the input stream.""" 687 return self.reader.next() 688 689 def __iter__(self): 690 return self 691 692 def write(self, data): 693 694 return self.writer.write(data) 695 696 def writelines(self, list): 697 698 return self.writer.writelines(list) 699 700 def reset(self): 701 702 self.reader.reset() 703 self.writer.reset() 704 705 def seek(self, offset, whence=0): 706 self.stream.seek(offset, whence) 707 self.reader.reset() 708 if whence == 0 and offset == 0: 709 self.writer.reset() 710 711 def __getattr__(self, name, 712 getattr=getattr): 713 714 """ Inherit all other methods from the underlying stream. 715 """ 716 return getattr(self.stream, name) 717 718 # these are needed to make "with codecs.open(...)" work properly 719 720 def __enter__(self): 721 return self 722 723 def __exit__(self, type, value, tb): 724 self.stream.close() 725 726 ### 727 728 class StreamRecoder: 729 730 """ StreamRecoder instances provide a frontend - backend 731 view of encoding data. 732 733 They use the complete set of APIs returned by the 734 codecs.lookup() function to implement their task. 735 736 Data written to the stream is first decoded into an 737 intermediate format (which is dependent on the given codec 738 combination) and then written to the stream using an instance 739 of the provided Writer class. 740 741 In the other direction, data is read from the stream using a 742 Reader instance and then return encoded data to the caller. 743 744 """ 745 # Optional attributes set by the file wrappers below 746 data_encoding = 'unknown' 747 file_encoding = 'unknown' 748 749 def __init__(self, stream, encode, decode, Reader, Writer, 750 errors='strict'): 751 752 """ Creates a StreamRecoder instance which implements a two-way 753 conversion: encode and decode work on the frontend (the 754 input to .read() and output of .write()) while 755 Reader and Writer work on the backend (reading and 756 writing to the stream). 757 758 You can use these objects to do transparent direct 759 recodings from e.g. latin-1 to utf-8 and back. 760 761 stream must be a file-like object. 762 763 encode, decode must adhere to the Codec interface, Reader, 764 Writer must be factory functions or classes providing the 765 StreamReader, StreamWriter interface resp. 766 767 encode and decode are needed for the frontend translation, 768 Reader and Writer for the backend translation. Unicode is 769 used as intermediate encoding. 770 771 Error handling is done in the same way as defined for the 772 StreamWriter/Readers. 773 774 """ 775 self.stream = stream 776 self.encode = encode 777 self.decode = decode 778 self.reader = Reader(stream, errors) 779 self.writer = Writer(stream, errors) 780 self.errors = errors 781 782 def read(self, size=-1): 783 784 data = self.reader.read(size) 785 data, bytesencoded = self.encode(data, self.errors) 786 return data 787 788 def readline(self, size=None): 789 790 if size is None: 791 data = self.reader.readline() 792 else: 793 data = self.reader.readline(size) 794 data, bytesencoded = self.encode(data, self.errors) 795 return data 796 797 def readlines(self, sizehint=None): 798 799 data = self.reader.read() 800 data, bytesencoded = self.encode(data, self.errors) 801 return data.splitlines(1) 802 803 def next(self): 804 805 """ Return the next decoded line from the input stream.""" 806 data = self.reader.next() 807 data, bytesencoded = self.encode(data, self.errors) 808 return data 809 810 def __iter__(self): 811 return self 812 813 def write(self, data): 814 815 data, bytesdecoded = self.decode(data, self.errors) 816 return self.writer.write(data) 817 818 def writelines(self, list): 819 820 data = ''.join(list) 821 data, bytesdecoded = self.decode(data, self.errors) 822 return self.writer.write(data) 823 824 def reset(self): 825 826 self.reader.reset() 827 self.writer.reset() 828 829 def __getattr__(self, name, 830 getattr=getattr): 831 832 """ Inherit all other methods from the underlying stream. 833 """ 834 return getattr(self.stream, name) 835 836 def __enter__(self): 837 return self 838 839 def __exit__(self, type, value, tb): 840 self.stream.close() 841 842 ### Shortcuts 843 844 def open(filename, mode='rb', encoding=None, errors='strict', buffering=1): 845 846 """ Open an encoded file using the given mode and return 847 a wrapped version providing transparent encoding/decoding. 848 849 Note: The wrapped version will only accept the object format 850 defined by the codecs, i.e. Unicode objects for most builtin 851 codecs. Output is also codec dependent and will usually be 852 Unicode as well. 853 854 Files are always opened in binary mode, even if no binary mode 855 was specified. This is done to avoid data loss due to encodings 856 using 8-bit values. The default file mode is 'rb' meaning to 857 open the file in binary read mode. 858 859 encoding specifies the encoding which is to be used for the 860 file. 861 862 errors may be given to define the error handling. It defaults 863 to 'strict' which causes ValueErrors to be raised in case an 864 encoding error occurs. 865 866 buffering has the same meaning as for the builtin open() API. 867 It defaults to line buffered. 868 869 The returned wrapped file object provides an extra attribute 870 .encoding which allows querying the used encoding. This 871 attribute is only available if an encoding was specified as 872 parameter. 873 874 """ 875 if encoding is not None: 876 if 'U' in mode: 877 # No automatic conversion of '\n' is done on reading and writing 878 mode = mode.strip().replace('U', '') 879 if mode[:1] not in set('rwa'): 880 mode = 'r' + mode 881 if 'b' not in mode: 882 # Force opening of the file in binary mode 883 mode = mode + 'b' 884 file = __builtin__.open(filename, mode, buffering) 885 if encoding is None: 886 return file 887 info = lookup(encoding) 888 srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors) 889 # Add attributes to simplify introspection 890 srw.encoding = encoding 891 return srw 892 893 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'): 894 895 """ Return a wrapped version of file which provides transparent 896 encoding translation. 897 898 Strings written to the wrapped file are interpreted according 899 to the given data_encoding and then written to the original 900 file as string using file_encoding. The intermediate encoding 901 will usually be Unicode but depends on the specified codecs. 902 903 Strings are read from the file using file_encoding and then 904 passed back to the caller as string using data_encoding. 905 906 If file_encoding is not given, it defaults to data_encoding. 907 908 errors may be given to define the error handling. It defaults 909 to 'strict' which causes ValueErrors to be raised in case an 910 encoding error occurs. 911 912 The returned wrapped file object provides two extra attributes 913 .data_encoding and .file_encoding which reflect the given 914 parameters of the same name. The attributes can be used for 915 introspection by Python programs. 916 917 """ 918 if file_encoding is None: 919 file_encoding = data_encoding 920 data_info = lookup(data_encoding) 921 file_info = lookup(file_encoding) 922 sr = StreamRecoder(file, data_info.encode, data_info.decode, 923 file_info.streamreader, file_info.streamwriter, errors) 924 # Add attributes to simplify introspection 925 sr.data_encoding = data_encoding 926 sr.file_encoding = file_encoding 927 return sr 928 929 ### Helpers for codec lookup 930 931 def getencoder(encoding): 932 933 """ Lookup up the codec for the given encoding and return 934 its encoder function. 935 936 Raises a LookupError in case the encoding cannot be found. 937 938 """ 939 return lookup(encoding).encode 940 941 def getdecoder(encoding): 942 943 """ Lookup up the codec for the given encoding and return 944 its decoder function. 945 946 Raises a LookupError in case the encoding cannot be found. 947 948 """ 949 return lookup(encoding).decode 950 951 def getincrementalencoder(encoding): 952 953 """ Lookup up the codec for the given encoding and return 954 its IncrementalEncoder class or factory function. 955 956 Raises a LookupError in case the encoding cannot be found 957 or the codecs doesn't provide an incremental encoder. 958 959 """ 960 encoder = lookup(encoding).incrementalencoder 961 if encoder is None: 962 raise LookupError(encoding) 963 return encoder 964 965 def getincrementaldecoder(encoding): 966 967 """ Lookup up the codec for the given encoding and return 968 its IncrementalDecoder class or factory function. 969 970 Raises a LookupError in case the encoding cannot be found 971 or the codecs doesn't provide an incremental decoder. 972 973 """ 974 decoder = lookup(encoding).incrementaldecoder 975 if decoder is None: 976 raise LookupError(encoding) 977 return decoder 978 979 def getreader(encoding): 980 981 """ Lookup up the codec for the given encoding and return 982 its StreamReader class or factory function. 983 984 Raises a LookupError in case the encoding cannot be found. 985 986 """ 987 return lookup(encoding).streamreader 988 989 def getwriter(encoding): 990 991 """ Lookup up the codec for the given encoding and return 992 its StreamWriter class or factory function. 993 994 Raises a LookupError in case the encoding cannot be found. 995 996 """ 997 return lookup(encoding).streamwriter 998 999 def iterencode(iterator, encoding, errors='strict', **kwargs): 1000 """ 1001 Encoding iterator. 1002 1003 Encodes the input strings from the iterator using a IncrementalEncoder. 1004 1005 errors and kwargs are passed through to the IncrementalEncoder 1006 constructor. 1007 """ 1008 encoder = getincrementalencoder(encoding)(errors, **kwargs) 1009 for input in iterator: 1010 output = encoder.encode(input) 1011 if output: 1012 yield output 1013 output = encoder.encode("", True) 1014 if output: 1015 yield output 1016 1017 def iterdecode(iterator, encoding, errors='strict', **kwargs): 1018 """ 1019 Decoding iterator. 1020 1021 Decodes the input strings from the iterator using a IncrementalDecoder. 1022 1023 errors and kwargs are passed through to the IncrementalDecoder 1024 constructor. 1025 """ 1026 decoder = getincrementaldecoder(encoding)(errors, **kwargs) 1027 for input in iterator: 1028 output = decoder.decode(input) 1029 if output: 1030 yield output 1031 output = decoder.decode("", True) 1032 if output: 1033 yield output 1034 1035 ### Helpers for charmap-based codecs 1036 1037 def make_identity_dict(rng): 1038 1039 """ make_identity_dict(rng) -> dict 1040 1041 Return a dictionary where elements of the rng sequence are 1042 mapped to themselves. 1043 1044 """ 1045 res = {} 1046 for i in rng: 1047 res[i]=i 1048 return res 1049 1050 def make_encoding_map(decoding_map): 1051 1052 """ Creates an encoding map from a decoding map. 1053 1054 If a target mapping in the decoding map occurs multiple 1055 times, then that target is mapped to None (undefined mapping), 1056 causing an exception when encountered by the charmap codec 1057 during translation. 1058 1059 One example where this happens is cp875.py which decodes 1060 multiple character to \\u001a. 1061 1062 """ 1063 m = {} 1064 for k,v in decoding_map.items(): 1065 if not v in m: 1066 m[v] = k 1067 else: 1068 m[v] = None 1069 return m 1070 1071 ### error handlers 1072 1073 try: 1074 strict_errors = lookup_error("strict") 1075 ignore_errors = lookup_error("ignore") 1076 replace_errors = lookup_error("replace") 1077 xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace") 1078 backslashreplace_errors = lookup_error("backslashreplace") 1079 except LookupError: 1080 # In --disable-unicode builds, these error handler are missing 1081 strict_errors = None 1082 ignore_errors = None 1083 replace_errors = None 1084 xmlcharrefreplace_errors = None 1085 backslashreplace_errors = None 1086 1087 # Tell modulefinder that using codecs probably needs the encodings 1088 # package 1089 _false = 0 1090 if _false: 1091 import encodings 1092 1093 ### Tests 1094 1095 if __name__ == '__main__': 1096 1097 # Make stdout translate Latin-1 output into UTF-8 output 1098 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8') 1099 1100 # Have stdin translate Latin-1 input into UTF-8 input 1101 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1') 1102