Home | History | Annotate | Download | only in Lib
      1 """ codecs -- Python Codec Registry, API and helpers.
      2 
      3 
Written by Marc-Andre Lemburg (mal@lemburg.com).
      5 
      6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
      7 
      8 """#"
      9 
     10 import builtins, sys
     11 
     12 ### Registry and builtin stateless codec functions
     13 
# Import every public name of the builtin _codecs module into this
# namespace (registry functions and the stateless codec helpers).
try:
    from _codecs import *
except ImportError as why:
    # Without _codecs this module cannot work at all, so the import
    # failure is re-reported as a SystemError carrying the original text.
    raise SystemError('Failed to load the builtin codecs: %s' % why)
     18 
     19 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
     20            "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
     21            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
     22            "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
     23            "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
     24            "StreamReader", "StreamWriter",
     25            "StreamReaderWriter", "StreamRecoder",
     26            "getencoder", "getdecoder", "getincrementalencoder",
     27            "getincrementaldecoder", "getreader", "getwriter",
     28            "encode", "decode", "iterencode", "iterdecode",
     29            "strict_errors", "ignore_errors", "replace_errors",
     30            "xmlcharrefreplace_errors",
     31            "backslashreplace_errors", "namereplace_errors",
     32            "register_error", "lookup_error"]
     33 
     34 ### Constants
     35 
     36 #
     37 # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
     38 # and its possible byte string values
     39 # for UTF8/UTF16/UTF32 output and little/big endian machines
     40 #
     41 
     42 # UTF-8
     43 BOM_UTF8 = b'\xef\xbb\xbf'
     44 
     45 # UTF-16, little endian
     46 BOM_LE = BOM_UTF16_LE = b'\xff\xfe'
     47 
     48 # UTF-16, big endian
     49 BOM_BE = BOM_UTF16_BE = b'\xfe\xff'
     50 
     51 # UTF-32, little endian
     52 BOM_UTF32_LE = b'\xff\xfe\x00\x00'
     53 
     54 # UTF-32, big endian
     55 BOM_UTF32_BE = b'\x00\x00\xfe\xff'
     56 
     57 if sys.byteorder == 'little':
     58 
     59     # UTF-16, native endianness
     60     BOM = BOM_UTF16 = BOM_UTF16_LE
     61 
     62     # UTF-32, native endianness
     63     BOM_UTF32 = BOM_UTF32_LE
     64 
     65 else:
     66 
     67     # UTF-16, native endianness
     68     BOM = BOM_UTF16 = BOM_UTF16_BE
     69 
     70     # UTF-32, native endianness
     71     BOM_UTF32 = BOM_UTF32_BE
     72 
     73 # Old broken names (don't use in new code)
     74 BOM32_LE = BOM_UTF16_LE
     75 BOM32_BE = BOM_UTF16_BE
     76 BOM64_LE = BOM_UTF32_LE
     77 BOM64_BE = BOM_UTF32_BE
     78 
     79 
     80 ### Codec base classes (defining the API)
     81 
     82 class CodecInfo(tuple):
     83     """Codec details when looking up the codec registry"""
     84 
     85     # Private API to allow Python 3.4 to blacklist the known non-Unicode
     86     # codecs in the standard library. A more general mechanism to
     87     # reliably distinguish test encodings from other codecs will hopefully
     88     # be defined for Python 3.5
     89     #
     90     # See http://bugs.python.org/issue19619
     91     _is_text_encoding = True # Assume codecs are text encodings by default
     92 
     93     def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
     94         incrementalencoder=None, incrementaldecoder=None, name=None,
     95         *, _is_text_encoding=None):
     96         self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
     97         self.name = name
     98         self.encode = encode
     99         self.decode = decode
    100         self.incrementalencoder = incrementalencoder
    101         self.incrementaldecoder = incrementaldecoder
    102         self.streamwriter = streamwriter
    103         self.streamreader = streamreader
    104         if _is_text_encoding is not None:
    105             self._is_text_encoding = _is_text_encoding
    106         return self
    107 
    108     def __repr__(self):
    109         return "<%s.%s object for encoding %s at %#x>" % \
    110                 (self.__class__.__module__, self.__class__.__qualname__,
    111                  self.name, id(self))
    112 
    113 class Codec:
    114 
    115     """ Defines the interface for stateless encoders/decoders.
    116 
    117         The .encode()/.decode() methods may use different error
    118         handling schemes by providing the errors argument. These
    119         string values are predefined:
    120 
    121          'strict' - raise a ValueError error (or a subclass)
    122          'ignore' - ignore the character and continue with the next
    123          'replace' - replace with a suitable replacement character;
    124                     Python will use the official U+FFFD REPLACEMENT
    125                     CHARACTER for the builtin Unicode codecs on
    126                     decoding and '?' on encoding.
    127          'surrogateescape' - replace with private code points U+DCnn.
    128          'xmlcharrefreplace' - Replace with the appropriate XML
    129                                character reference (only for encoding).
    130          'backslashreplace'  - Replace with backslashed escape sequences.
    131          'namereplace'       - Replace with \\N{...} escape sequences
    132                                (only for encoding).
    133 
    134         The set of allowed values can be extended via register_error.
    135 
    136     """
    137     def encode(self, input, errors='strict'):
    138 
    139         """ Encodes the object input and returns a tuple (output
    140             object, length consumed).
    141 
    142             errors defines the error handling to apply. It defaults to
    143             'strict' handling.
    144 
    145             The method may not store state in the Codec instance. Use
    146             StreamWriter for codecs which have to keep state in order to
    147             make encoding efficient.
    148 
    149             The encoder must be able to handle zero length input and
    150             return an empty object of the output object type in this
    151             situation.
    152 
    153         """
    154         raise NotImplementedError
    155 
    156     def decode(self, input, errors='strict'):
    157 
    158         """ Decodes the object input and returns a tuple (output
    159             object, length consumed).
    160 
    161             input must be an object which provides the bf_getreadbuf
    162             buffer slot. Python strings, buffer objects and memory
    163             mapped files are examples of objects providing this slot.
    164 
    165             errors defines the error handling to apply. It defaults to
    166             'strict' handling.
    167 
    168             The method may not store state in the Codec instance. Use
    169             StreamReader for codecs which have to keep state in order to
    170             make decoding efficient.
    171 
    172             The decoder must be able to handle zero length input and
    173             return an empty object of the output object type in this
    174             situation.
    175 
    176         """
    177         raise NotImplementedError
    178 
    179 class IncrementalEncoder(object):
    180     """
    181     An IncrementalEncoder encodes an input in multiple steps. The input can
    182     be passed piece by piece to the encode() method. The IncrementalEncoder
    183     remembers the state of the encoding process between calls to encode().
    184     """
    185     def __init__(self, errors='strict'):
    186         """
    187         Creates an IncrementalEncoder instance.
    188 
    189         The IncrementalEncoder may use different error handling schemes by
    190         providing the errors keyword argument. See the module docstring
    191         for a list of possible values.
    192         """
    193         self.errors = errors
    194         self.buffer = ""
    195 
    196     def encode(self, input, final=False):
    197         """
    198         Encodes input and returns the resulting object.
    199         """
    200         raise NotImplementedError
    201 
    202     def reset(self):
    203         """
    204         Resets the encoder to the initial state.
    205         """
    206 
    207     def getstate(self):
    208         """
    209         Return the current state of the encoder.
    210         """
    211         return 0
    212 
    213     def setstate(self, state):
    214         """
    215         Set the current state of the encoder. state must have been
    216         returned by getstate().
    217         """
    218 
    219 class BufferedIncrementalEncoder(IncrementalEncoder):
    220     """
    221     This subclass of IncrementalEncoder can be used as the baseclass for an
    222     incremental encoder if the encoder must keep some of the output in a
    223     buffer between calls to encode().
    224     """
    225     def __init__(self, errors='strict'):
    226         IncrementalEncoder.__init__(self, errors)
    227         # unencoded input that is kept between calls to encode()
    228         self.buffer = ""
    229 
    230     def _buffer_encode(self, input, errors, final):
    231         # Overwrite this method in subclasses: It must encode input
    232         # and return an (output, length consumed) tuple
    233         raise NotImplementedError
    234 
    235     def encode(self, input, final=False):
    236         # encode input (taking the buffer into account)
    237         data = self.buffer + input
    238         (result, consumed) = self._buffer_encode(data, self.errors, final)
    239         # keep unencoded input until the next call
    240         self.buffer = data[consumed:]
    241         return result
    242 
    243     def reset(self):
    244         IncrementalEncoder.reset(self)
    245         self.buffer = ""
    246 
    247     def getstate(self):
    248         return self.buffer or 0
    249 
    250     def setstate(self, state):
    251         self.buffer = state or ""
    252 
    253 class IncrementalDecoder(object):
    254     """
    255     An IncrementalDecoder decodes an input in multiple steps. The input can
    256     be passed piece by piece to the decode() method. The IncrementalDecoder
    257     remembers the state of the decoding process between calls to decode().
    258     """
    259     def __init__(self, errors='strict'):
    260         """
    261         Create an IncrementalDecoder instance.
    262 
    263         The IncrementalDecoder may use different error handling schemes by
    264         providing the errors keyword argument. See the module docstring
    265         for a list of possible values.
    266         """
    267         self.errors = errors
    268 
    269     def decode(self, input, final=False):
    270         """
    271         Decode input and returns the resulting object.
    272         """
    273         raise NotImplementedError
    274 
    275     def reset(self):
    276         """
    277         Reset the decoder to the initial state.
    278         """
    279 
    280     def getstate(self):
    281         """
    282         Return the current state of the decoder.
    283 
    284         This must be a (buffered_input, additional_state_info) tuple.
    285         buffered_input must be a bytes object containing bytes that
    286         were passed to decode() that have not yet been converted.
    287         additional_state_info must be a non-negative integer
    288         representing the state of the decoder WITHOUT yet having
    289         processed the contents of buffered_input.  In the initial state
    290         and after reset(), getstate() must return (b"", 0).
    291         """
    292         return (b"", 0)
    293 
    294     def setstate(self, state):
    295         """
    296         Set the current state of the decoder.
    297 
    298         state must have been returned by getstate().  The effect of
    299         setstate((b"", 0)) must be equivalent to reset().
    300         """
    301 
    302 class BufferedIncrementalDecoder(IncrementalDecoder):
    303     """
    304     This subclass of IncrementalDecoder can be used as the baseclass for an
    305     incremental decoder if the decoder must be able to handle incomplete
    306     byte sequences.
    307     """
    308     def __init__(self, errors='strict'):
    309         IncrementalDecoder.__init__(self, errors)
    310         # undecoded input that is kept between calls to decode()
    311         self.buffer = b""
    312 
    313     def _buffer_decode(self, input, errors, final):
    314         # Overwrite this method in subclasses: It must decode input
    315         # and return an (output, length consumed) tuple
    316         raise NotImplementedError
    317 
    318     def decode(self, input, final=False):
    319         # decode input (taking the buffer into account)
    320         data = self.buffer + input
    321         (result, consumed) = self._buffer_decode(data, self.errors, final)
    322         # keep undecoded input until the next call
    323         self.buffer = data[consumed:]
    324         return result
    325 
    326     def reset(self):
    327         IncrementalDecoder.reset(self)
    328         self.buffer = b""
    329 
    330     def getstate(self):
    331         # additional state info is always 0
    332         return (self.buffer, 0)
    333 
    334     def setstate(self, state):
    335         # ignore additional state info
    336         self.buffer = state[0]
    337 
    338 #
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
    343 #
    344 
    345 class StreamWriter(Codec):
    346 
    347     def __init__(self, stream, errors='strict'):
    348 
    349         """ Creates a StreamWriter instance.
    350 
    351             stream must be a file-like object open for writing.
    352 
    353             The StreamWriter may use different error handling
    354             schemes by providing the errors keyword argument. These
    355             parameters are predefined:
    356 
    357              'strict' - raise a ValueError (or a subclass)
    358              'ignore' - ignore the character and continue with the next
    359              'replace'- replace with a suitable replacement character
    360              'xmlcharrefreplace' - Replace with the appropriate XML
    361                                    character reference.
    362              'backslashreplace'  - Replace with backslashed escape
    363                                    sequences.
    364              'namereplace'       - Replace with \\N{...} escape sequences.
    365 
    366             The set of allowed parameter values can be extended via
    367             register_error.
    368         """
    369         self.stream = stream
    370         self.errors = errors
    371 
    372     def write(self, object):
    373 
    374         """ Writes the object's contents encoded to self.stream.
    375         """
    376         data, consumed = self.encode(object, self.errors)
    377         self.stream.write(data)
    378 
    379     def writelines(self, list):
    380 
    381         """ Writes the concatenated list of strings to the stream
    382             using .write().
    383         """
    384         self.write(''.join(list))
    385 
    386     def reset(self):
    387 
    388         """ Flushes and resets the codec buffers used for keeping state.
    389 
    390             Calling this method should ensure that the data on the
    391             output is put into a clean state, that allows appending
    392             of new fresh data without having to rescan the whole
    393             stream to recover state.
    394 
    395         """
    396         pass
    397 
    398     def seek(self, offset, whence=0):
    399         self.stream.seek(offset, whence)
    400         if whence == 0 and offset == 0:
    401             self.reset()
    402 
    403     def __getattr__(self, name,
    404                     getattr=getattr):
    405 
    406         """ Inherit all other methods from the underlying stream.
    407         """
    408         return getattr(self.stream, name)
    409 
    410     def __enter__(self):
    411         return self
    412 
    413     def __exit__(self, type, value, tb):
    414         self.stream.close()
    415 
    416 ###
    417 
    418 class StreamReader(Codec):
    419 
    420     charbuffertype = str
    421 
    422     def __init__(self, stream, errors='strict'):
    423 
    424         """ Creates a StreamReader instance.
    425 
    426             stream must be a file-like object open for reading.
    427 
    428             The StreamReader may use different error handling
    429             schemes by providing the errors keyword argument. These
    430             parameters are predefined:
    431 
    432              'strict' - raise a ValueError (or a subclass)
    433              'ignore' - ignore the character and continue with the next
    434              'replace'- replace with a suitable replacement character
    435              'backslashreplace' - Replace with backslashed escape sequences;
    436 
    437             The set of allowed parameter values can be extended via
    438             register_error.
    439         """
    440         self.stream = stream
    441         self.errors = errors
    442         self.bytebuffer = b""
    443         self._empty_charbuffer = self.charbuffertype()
    444         self.charbuffer = self._empty_charbuffer
    445         self.linebuffer = None
    446 
    447     def decode(self, input, errors='strict'):
    448         raise NotImplementedError
    449 
    450     def read(self, size=-1, chars=-1, firstline=False):
    451 
    452         """ Decodes data from the stream self.stream and returns the
    453             resulting object.
    454 
    455             chars indicates the number of decoded code points or bytes to
    456             return. read() will never return more data than requested,
    457             but it might return less, if there is not enough available.
    458 
    459             size indicates the approximate maximum number of decoded
    460             bytes or code points to read for decoding. The decoder
    461             can modify this setting as appropriate. The default value
    462             -1 indicates to read and decode as much as possible.  size
    463             is intended to prevent having to decode huge files in one
    464             step.
    465 
    466             If firstline is true, and a UnicodeDecodeError happens
    467             after the first line terminator in the input only the first line
    468             will be returned, the rest of the input will be kept until the
    469             next call to read().
    470 
    471             The method should use a greedy read strategy, meaning that
    472             it should read as much data as is allowed within the
    473             definition of the encoding and the given size, e.g.  if
    474             optional encoding endings or state markers are available
    475             on the stream, these should be read too.
    476         """
    477         # If we have lines cached, first merge them back into characters
    478         if self.linebuffer:
    479             self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
    480             self.linebuffer = None
    481 
    482         # read until we get the required number of characters (if available)
    483         while True:
    484             # can the request be satisfied from the character buffer?
    485             if chars >= 0:
    486                 if len(self.charbuffer) >= chars:
    487                     break
    488             elif size >= 0:
    489                 if len(self.charbuffer) >= size:
    490                     break
    491             # we need more data
    492             if size < 0:
    493                 newdata = self.stream.read()
    494             else:
    495                 newdata = self.stream.read(size)
    496             # decode bytes (those remaining from the last call included)
    497             data = self.bytebuffer + newdata
    498             if not data:
    499                 break
    500             try:
    501                 newchars, decodedbytes = self.decode(data, self.errors)
    502             except UnicodeDecodeError as exc:
    503                 if firstline:
    504                     newchars, decodedbytes = \
    505                         self.decode(data[:exc.start], self.errors)
    506                     lines = newchars.splitlines(keepends=True)
    507                     if len(lines)<=1:
    508                         raise
    509                 else:
    510                     raise
    511             # keep undecoded bytes until the next call
    512             self.bytebuffer = data[decodedbytes:]
    513             # put new characters in the character buffer
    514             self.charbuffer += newchars
    515             # there was no data available
    516             if not newdata:
    517                 break
    518         if chars < 0:
    519             # Return everything we've got
    520             result = self.charbuffer
    521             self.charbuffer = self._empty_charbuffer
    522         else:
    523             # Return the first chars characters
    524             result = self.charbuffer[:chars]
    525             self.charbuffer = self.charbuffer[chars:]
    526         return result
    527 
    528     def readline(self, size=None, keepends=True):
    529 
    530         """ Read one line from the input stream and return the
    531             decoded data.
    532 
    533             size, if given, is passed as size argument to the
    534             read() method.
    535 
    536         """
    537         # If we have lines cached from an earlier read, return
    538         # them unconditionally
    539         if self.linebuffer:
    540             line = self.linebuffer[0]
    541             del self.linebuffer[0]
    542             if len(self.linebuffer) == 1:
    543                 # revert to charbuffer mode; we might need more data
    544                 # next time
    545                 self.charbuffer = self.linebuffer[0]
    546                 self.linebuffer = None
    547             if not keepends:
    548                 line = line.splitlines(keepends=False)[0]
    549             return line
    550 
    551         readsize = size or 72
    552         line = self._empty_charbuffer
    553         # If size is given, we call read() only once
    554         while True:
    555             data = self.read(readsize, firstline=True)
    556             if data:
    557                 # If we're at a "\r" read one extra character (which might
    558                 # be a "\n") to get a proper line ending. If the stream is
    559                 # temporarily exhausted we return the wrong line ending.
    560                 if (isinstance(data, str) and data.endswith("\r")) or \
    561                    (isinstance(data, bytes) and data.endswith(b"\r")):
    562                     data += self.read(size=1, chars=1)
    563 
    564             line += data
    565             lines = line.splitlines(keepends=True)
    566             if lines:
    567                 if len(lines) > 1:
    568                     # More than one line result; the first line is a full line
    569                     # to return
    570                     line = lines[0]
    571                     del lines[0]
    572                     if len(lines) > 1:
    573                         # cache the remaining lines
    574                         lines[-1] += self.charbuffer
    575                         self.linebuffer = lines
    576                         self.charbuffer = None
    577                     else:
    578                         # only one remaining line, put it back into charbuffer
    579                         self.charbuffer = lines[0] + self.charbuffer
    580                     if not keepends:
    581                         line = line.splitlines(keepends=False)[0]
    582                     break
    583                 line0withend = lines[0]
    584                 line0withoutend = lines[0].splitlines(keepends=False)[0]
    585                 if line0withend != line0withoutend: # We really have a line end
    586                     # Put the rest back together and keep it until the next call
    587                     self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
    588                                       self.charbuffer
    589                     if keepends:
    590                         line = line0withend
    591                     else:
    592                         line = line0withoutend
    593                     break
    594             # we didn't get anything or this was our only try
    595             if not data or size is not None:
    596                 if line and not keepends:
    597                     line = line.splitlines(keepends=False)[0]
    598                 break
    599             if readsize < 8000:
    600                 readsize *= 2
    601         return line
    602 
    603     def readlines(self, sizehint=None, keepends=True):
    604 
    605         """ Read all lines available on the input stream
    606             and return them as a list.
    607 
    608             Line breaks are implemented using the codec's decoder
    609             method and are included in the list entries.
    610 
    611             sizehint, if given, is ignored since there is no efficient
    612             way to finding the true end-of-line.
    613 
    614         """
    615         data = self.read()
    616         return data.splitlines(keepends)
    617 
    618     def reset(self):
    619 
    620         """ Resets the codec buffers used for keeping state.
    621 
    622             Note that no stream repositioning should take place.
    623             This method is primarily intended to be able to recover
    624             from decoding errors.
    625 
    626         """
    627         self.bytebuffer = b""
    628         self.charbuffer = self._empty_charbuffer
    629         self.linebuffer = None
    630 
    631     def seek(self, offset, whence=0):
    632         """ Set the input stream's current position.
    633 
    634             Resets the codec buffers used for keeping state.
    635         """
    636         self.stream.seek(offset, whence)
    637         self.reset()
    638 
    639     def __next__(self):
    640 
    641         """ Return the next decoded line from the input stream."""
    642         line = self.readline()
    643         if line:
    644             return line
    645         raise StopIteration
    646 
    647     def __iter__(self):
    648         return self
    649 
    650     def __getattr__(self, name,
    651                     getattr=getattr):
    652 
    653         """ Inherit all other methods from the underlying stream.
    654         """
    655         return getattr(self.stream, name)
    656 
    657     def __enter__(self):
    658         return self
    659 
    660     def __exit__(self, type, value, tb):
    661         self.stream.close()
    662 
    663 ###
    664 
    665 class StreamReaderWriter:
    666 
    667     """ StreamReaderWriter instances allow wrapping streams which
    668         work in both read and write modes.
    669 
    670         The design is such that one can use the factory functions
    671         returned by the codec.lookup() function to construct the
    672         instance.
    673 
    674     """
    675     # Optional attributes set by the file wrappers below
    676     encoding = 'unknown'
    677 
    678     def __init__(self, stream, Reader, Writer, errors='strict'):
    679 
    680         """ Creates a StreamReaderWriter instance.
    681 
    682             stream must be a Stream-like object.
    683 
    684             Reader, Writer must be factory functions or classes
    685             providing the StreamReader, StreamWriter interface resp.
    686 
    687             Error handling is done in the same way as defined for the
    688             StreamWriter/Readers.
    689 
    690         """
    691         self.stream = stream
    692         self.reader = Reader(stream, errors)
    693         self.writer = Writer(stream, errors)
    694         self.errors = errors
    695 
    696     def read(self, size=-1):
    697 
    698         return self.reader.read(size)
    699 
    700     def readline(self, size=None):
    701 
    702         return self.reader.readline(size)
    703 
    704     def readlines(self, sizehint=None):
    705 
    706         return self.reader.readlines(sizehint)
    707 
    708     def __next__(self):
    709 
    710         """ Return the next decoded line from the input stream."""
    711         return next(self.reader)
    712 
    713     def __iter__(self):
    714         return self
    715 
    716     def write(self, data):
    717 
    718         return self.writer.write(data)
    719 
    720     def writelines(self, list):
    721 
    722         return self.writer.writelines(list)
    723 
    724     def reset(self):
    725 
    726         self.reader.reset()
    727         self.writer.reset()
    728 
    729     def seek(self, offset, whence=0):
    730         self.stream.seek(offset, whence)
    731         self.reader.reset()
    732         if whence == 0 and offset == 0:
    733             self.writer.reset()
    734 
    735     def __getattr__(self, name,
    736                     getattr=getattr):
    737 
    738         """ Inherit all other methods from the underlying stream.
    739         """
    740         return getattr(self.stream, name)
    741 
    742     # these are needed to make "with codecs.open(...)" work properly
    743 
    744     def __enter__(self):
    745         return self
    746 
    747     def __exit__(self, type, value, tb):
    748         self.stream.close()
    749 
    750 ###
    751 
    752 class StreamRecoder:
    753 
    754     """ StreamRecoder instances translate data from one encoding to another.
    755 
    756         They use the complete set of APIs returned by the
    757         codecs.lookup() function to implement their task.
    758 
    759         Data written to the StreamRecoder is first decoded into an
    760         intermediate format (depending on the "decode" codec) and then
    761         written to the underlying stream using an instance of the provided
    762         Writer class.
    763 
    764         In the other direction, data is read from the underlying stream using
    765         a Reader instance and then encoded and returned to the caller.
    766 
    767     """
    768     # Optional attributes set by the file wrappers below
    769     data_encoding = 'unknown'
    770     file_encoding = 'unknown'
    771 
    772     def __init__(self, stream, encode, decode, Reader, Writer,
    773                  errors='strict'):
    774 
    775         """ Creates a StreamRecoder instance which implements a two-way
    776             conversion: encode and decode work on the frontend (the
    777             data visible to .read() and .write()) while Reader and Writer
    778             work on the backend (the data in stream).
    779 
    780             You can use these objects to do transparent
    781             transcodings from e.g. latin-1 to utf-8 and back.
    782 
    783             stream must be a file-like object.
    784 
    785             encode and decode must adhere to the Codec interface; Reader and
    786             Writer must be factory functions or classes providing the
    787             StreamReader and StreamWriter interfaces resp.
    788 
    789             Error handling is done in the same way as defined for the
    790             StreamWriter/Readers.
    791 
    792         """
    793         self.stream = stream
    794         self.encode = encode
    795         self.decode = decode
    796         self.reader = Reader(stream, errors)
    797         self.writer = Writer(stream, errors)
    798         self.errors = errors
    799 
    800     def read(self, size=-1):
    801 
    802         data = self.reader.read(size)
    803         data, bytesencoded = self.encode(data, self.errors)
    804         return data
    805 
    806     def readline(self, size=None):
    807 
    808         if size is None:
    809             data = self.reader.readline()
    810         else:
    811             data = self.reader.readline(size)
    812         data, bytesencoded = self.encode(data, self.errors)
    813         return data
    814 
    815     def readlines(self, sizehint=None):
    816 
    817         data = self.reader.read()
    818         data, bytesencoded = self.encode(data, self.errors)
    819         return data.splitlines(keepends=True)
    820 
    821     def __next__(self):
    822 
    823         """ Return the next decoded line from the input stream."""
    824         data = next(self.reader)
    825         data, bytesencoded = self.encode(data, self.errors)
    826         return data
    827 
    828     def __iter__(self):
    829         return self
    830 
    831     def write(self, data):
    832 
    833         data, bytesdecoded = self.decode(data, self.errors)
    834         return self.writer.write(data)
    835 
    836     def writelines(self, list):
    837 
    838         data = ''.join(list)
    839         data, bytesdecoded = self.decode(data, self.errors)
    840         return self.writer.write(data)
    841 
    842     def reset(self):
    843 
    844         self.reader.reset()
    845         self.writer.reset()
    846 
    847     def __getattr__(self, name,
    848                     getattr=getattr):
    849 
    850         """ Inherit all other methods from the underlying stream.
    851         """
    852         return getattr(self.stream, name)
    853 
    854     def __enter__(self):
    855         return self
    856 
    857     def __exit__(self, type, value, tb):
    858         self.stream.close()
    859 
    860 ### Shortcuts
    861 
    862 def open(filename, mode='r', encoding=None, errors='strict', buffering=1):
    863 
    864     """ Open an encoded file using the given mode and return
    865         a wrapped version providing transparent encoding/decoding.
    866 
    867         Note: The wrapped version will only accept the object format
    868         defined by the codecs, i.e. Unicode objects for most builtin
    869         codecs. Output is also codec dependent and will usually be
    870         Unicode as well.
    871 
    872         Underlying encoded files are always opened in binary mode.
    873         The default file mode is 'r', meaning to open the file in read mode.
    874 
    875         encoding specifies the encoding which is to be used for the
    876         file.
    877 
    878         errors may be given to define the error handling. It defaults
    879         to 'strict' which causes ValueErrors to be raised in case an
    880         encoding error occurs.
    881 
    882         buffering has the same meaning as for the builtin open() API.
    883         It defaults to line buffered.
    884 
    885         The returned wrapped file object provides an extra attribute
    886         .encoding which allows querying the used encoding. This
    887         attribute is only available if an encoding was specified as
    888         parameter.
    889 
    890     """
    891     if encoding is not None and \
    892        'b' not in mode:
    893         # Force opening of the file in binary mode
    894         mode = mode + 'b'
    895     file = builtins.open(filename, mode, buffering)
    896     if encoding is None:
    897         return file
    898     info = lookup(encoding)
    899     srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    900     # Add attributes to simplify introspection
    901     srw.encoding = encoding
    902     return srw
    903 
    904 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
    905 
    906     """ Return a wrapped version of file which provides transparent
    907         encoding translation.
    908 
    909         Data written to the wrapped file is decoded according
    910         to the given data_encoding and then encoded to the underlying
    911         file using file_encoding. The intermediate data type
    912         will usually be Unicode but depends on the specified codecs.
    913 
    914         Bytes read from the file are decoded using file_encoding and then
    915         passed back to the caller encoded using data_encoding.
    916 
    917         If file_encoding is not given, it defaults to data_encoding.
    918 
    919         errors may be given to define the error handling. It defaults
    920         to 'strict' which causes ValueErrors to be raised in case an
    921         encoding error occurs.
    922 
    923         The returned wrapped file object provides two extra attributes
    924         .data_encoding and .file_encoding which reflect the given
    925         parameters of the same name. The attributes can be used for
    926         introspection by Python programs.
    927 
    928     """
    929     if file_encoding is None:
    930         file_encoding = data_encoding
    931     data_info = lookup(data_encoding)
    932     file_info = lookup(file_encoding)
    933     sr = StreamRecoder(file, data_info.encode, data_info.decode,
    934                        file_info.streamreader, file_info.streamwriter, errors)
    935     # Add attributes to simplify introspection
    936     sr.data_encoding = data_encoding
    937     sr.file_encoding = file_encoding
    938     return sr
    939 
    940 ### Helpers for codec lookup
    941 
    942 def getencoder(encoding):
    943 
    944     """ Lookup up the codec for the given encoding and return
    945         its encoder function.
    946 
    947         Raises a LookupError in case the encoding cannot be found.
    948 
    949     """
    950     return lookup(encoding).encode
    951 
    952 def getdecoder(encoding):
    953 
    954     """ Lookup up the codec for the given encoding and return
    955         its decoder function.
    956 
    957         Raises a LookupError in case the encoding cannot be found.
    958 
    959     """
    960     return lookup(encoding).decode
    961 
    962 def getincrementalencoder(encoding):
    963 
    964     """ Lookup up the codec for the given encoding and return
    965         its IncrementalEncoder class or factory function.
    966 
    967         Raises a LookupError in case the encoding cannot be found
    968         or the codecs doesn't provide an incremental encoder.
    969 
    970     """
    971     encoder = lookup(encoding).incrementalencoder
    972     if encoder is None:
    973         raise LookupError(encoding)
    974     return encoder
    975 
    976 def getincrementaldecoder(encoding):
    977 
    978     """ Lookup up the codec for the given encoding and return
    979         its IncrementalDecoder class or factory function.
    980 
    981         Raises a LookupError in case the encoding cannot be found
    982         or the codecs doesn't provide an incremental decoder.
    983 
    984     """
    985     decoder = lookup(encoding).incrementaldecoder
    986     if decoder is None:
    987         raise LookupError(encoding)
    988     return decoder
    989 
    990 def getreader(encoding):
    991 
    992     """ Lookup up the codec for the given encoding and return
    993         its StreamReader class or factory function.
    994 
    995         Raises a LookupError in case the encoding cannot be found.
    996 
    997     """
    998     return lookup(encoding).streamreader
    999 
   1000 def getwriter(encoding):
   1001 
   1002     """ Lookup up the codec for the given encoding and return
   1003         its StreamWriter class or factory function.
   1004 
   1005         Raises a LookupError in case the encoding cannot be found.
   1006 
   1007     """
   1008     return lookup(encoding).streamwriter
   1009 
   1010 def iterencode(iterator, encoding, errors='strict', **kwargs):
   1011     """
   1012     Encoding iterator.
   1013 
   1014     Encodes the input strings from the iterator using an IncrementalEncoder.
   1015 
   1016     errors and kwargs are passed through to the IncrementalEncoder
   1017     constructor.
   1018     """
   1019     encoder = getincrementalencoder(encoding)(errors, **kwargs)
   1020     for input in iterator:
   1021         output = encoder.encode(input)
   1022         if output:
   1023             yield output
   1024     output = encoder.encode("", True)
   1025     if output:
   1026         yield output
   1027 
   1028 def iterdecode(iterator, encoding, errors='strict', **kwargs):
   1029     """
   1030     Decoding iterator.
   1031 
   1032     Decodes the input strings from the iterator using an IncrementalDecoder.
   1033 
   1034     errors and kwargs are passed through to the IncrementalDecoder
   1035     constructor.
   1036     """
   1037     decoder = getincrementaldecoder(encoding)(errors, **kwargs)
   1038     for input in iterator:
   1039         output = decoder.decode(input)
   1040         if output:
   1041             yield output
   1042     output = decoder.decode(b"", True)
   1043     if output:
   1044         yield output
   1045 
   1046 ### Helpers for charmap-based codecs
   1047 
   1048 def make_identity_dict(rng):
   1049 
   1050     """ make_identity_dict(rng) -> dict
   1051 
   1052         Return a dictionary where elements of the rng sequence are
   1053         mapped to themselves.
   1054 
   1055     """
   1056     return {i:i for i in rng}
   1057 
   1058 def make_encoding_map(decoding_map):
   1059 
   1060     """ Creates an encoding map from a decoding map.
   1061 
   1062         If a target mapping in the decoding map occurs multiple
   1063         times, then that target is mapped to None (undefined mapping),
   1064         causing an exception when encountered by the charmap codec
   1065         during translation.
   1066 
   1067         One example where this happens is cp875.py which decodes
   1068         multiple character to \\u001a.
   1069 
   1070     """
   1071     m = {}
   1072     for k,v in decoding_map.items():
   1073         if not v in m:
   1074             m[v] = k
   1075         else:
   1076             m[v] = None
   1077     return m
   1078 
   1079 ### error handlers
   1080 
   1081 try:
   1082     strict_errors = lookup_error("strict")
   1083     ignore_errors = lookup_error("ignore")
   1084     replace_errors = lookup_error("replace")
   1085     xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
   1086     backslashreplace_errors = lookup_error("backslashreplace")
   1087     namereplace_errors = lookup_error("namereplace")
   1088 except LookupError:
   1089     # In --disable-unicode builds, these error handler are missing
   1090     strict_errors = None
   1091     ignore_errors = None
   1092     replace_errors = None
   1093     xmlcharrefreplace_errors = None
   1094     backslashreplace_errors = None
   1095     namereplace_errors = None
   1096 
# Tell modulefinder that using codecs probably needs the encodings
# package.  _false is always 0, so the import below is never executed
# at runtime; it is only here to be seen by tools that inspect the
# module.
# NOTE(review): presumably a variable is used instead of a literal
# False so the import statement is not optimized away -- confirm
# before simplifying.
_false = 0
if _false:
    import encodings
   1102 
   1103 ### Tests
   1104 
   1105 if __name__ == '__main__':
   1106 
   1107     # Make stdout translate Latin-1 output into UTF-8 output
   1108     sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
   1109 
   1110     # Have stdin translate Latin-1 input into UTF-8 input
   1111     sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
   1112