Home | History | Annotate | Download | only in Lib
      1 """ codecs -- Python Codec Registry, API and helpers.
      2 
      3 
Written by Marc-Andre Lemburg (mal@lemburg.com).
      5 
      6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
      7 
      8 """#"
      9 
     10 import __builtin__, sys
     11 
     12 ### Registry and builtin stateless codec functions
     13 
     14 try:
     15     from _codecs import *
     16 except ImportError, why:
     17     raise SystemError('Failed to load the builtin codecs: %s' % why)
     18 
     19 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
     20            "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
     21            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
     22            "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
     23            "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
     24            "StreamReader", "StreamWriter",
     25            "StreamReaderWriter", "StreamRecoder",
     26            "getencoder", "getdecoder", "getincrementalencoder",
     27            "getincrementaldecoder", "getreader", "getwriter",
     28            "encode", "decode", "iterencode", "iterdecode",
     29            "strict_errors", "ignore_errors", "replace_errors",
     30            "xmlcharrefreplace_errors", "backslashreplace_errors",
     31            "register_error", "lookup_error"]
     32 
     33 ### Constants
     34 
     35 #
     36 # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
     37 # and its possible byte string values
     38 # for UTF8/UTF16/UTF32 output and little/big endian machines
     39 #
     40 
     41 # UTF-8
     42 BOM_UTF8 = '\xef\xbb\xbf'
     43 
     44 # UTF-16, little endian
     45 BOM_LE = BOM_UTF16_LE = '\xff\xfe'
     46 
     47 # UTF-16, big endian
     48 BOM_BE = BOM_UTF16_BE = '\xfe\xff'
     49 
     50 # UTF-32, little endian
     51 BOM_UTF32_LE = '\xff\xfe\x00\x00'
     52 
     53 # UTF-32, big endian
     54 BOM_UTF32_BE = '\x00\x00\xfe\xff'
     55 
     56 if sys.byteorder == 'little':
     57 
     58     # UTF-16, native endianness
     59     BOM = BOM_UTF16 = BOM_UTF16_LE
     60 
     61     # UTF-32, native endianness
     62     BOM_UTF32 = BOM_UTF32_LE
     63 
     64 else:
     65 
     66     # UTF-16, native endianness
     67     BOM = BOM_UTF16 = BOM_UTF16_BE
     68 
     69     # UTF-32, native endianness
     70     BOM_UTF32 = BOM_UTF32_BE
     71 
     72 # Old broken names (don't use in new code)
     73 BOM32_LE = BOM_UTF16_LE
     74 BOM32_BE = BOM_UTF16_BE
     75 BOM64_LE = BOM_UTF32_LE
     76 BOM64_BE = BOM_UTF32_BE
     77 
     78 
     79 ### Codec base classes (defining the API)
     80 
     81 class CodecInfo(tuple):
     82     """Codec details when looking up the codec registry"""
     83 
     84     # Private API to allow Python to blacklist the known non-Unicode
     85     # codecs in the standard library. A more general mechanism to
     86     # reliably distinguish test encodings from other codecs will hopefully
     87     # be defined for Python 3.5
     88     #
     89     # See http://bugs.python.org/issue19619
     90     _is_text_encoding = True # Assume codecs are text encodings by default
     91 
     92     def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
     93         incrementalencoder=None, incrementaldecoder=None, name=None,
     94         _is_text_encoding=None):
     95         self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
     96         self.name = name
     97         self.encode = encode
     98         self.decode = decode
     99         self.incrementalencoder = incrementalencoder
    100         self.incrementaldecoder = incrementaldecoder
    101         self.streamwriter = streamwriter
    102         self.streamreader = streamreader
    103         if _is_text_encoding is not None:
    104             self._is_text_encoding = _is_text_encoding
    105         return self
    106 
    107     def __repr__(self):
    108         return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
    109 
    110 class Codec:
    111 
    112     """ Defines the interface for stateless encoders/decoders.
    113 
    114         The .encode()/.decode() methods may use different error
    115         handling schemes by providing the errors argument. These
    116         string values are predefined:
    117 
    118          'strict' - raise a ValueError error (or a subclass)
    119          'ignore' - ignore the character and continue with the next
    120          'replace' - replace with a suitable replacement character;
    121                     Python will use the official U+FFFD REPLACEMENT
    122                     CHARACTER for the builtin Unicode codecs on
    123                     decoding and '?' on encoding.
    124          'xmlcharrefreplace' - Replace with the appropriate XML
    125                                character reference (only for encoding).
    126          'backslashreplace'  - Replace with backslashed escape sequences
    127                                (only for encoding).
    128 
    129         The set of allowed values can be extended via register_error.
    130 
    131     """
    132     def encode(self, input, errors='strict'):
    133 
    134         """ Encodes the object input and returns a tuple (output
    135             object, length consumed).
    136 
    137             errors defines the error handling to apply. It defaults to
    138             'strict' handling.
    139 
    140             The method may not store state in the Codec instance. Use
    141             StreamWriter for codecs which have to keep state in order to
    142             make encoding efficient.
    143 
    144             The encoder must be able to handle zero length input and
    145             return an empty object of the output object type in this
    146             situation.
    147 
    148         """
    149         raise NotImplementedError
    150 
    151     def decode(self, input, errors='strict'):
    152 
    153         """ Decodes the object input and returns a tuple (output
    154             object, length consumed).
    155 
    156             input must be an object which provides the bf_getreadbuf
    157             buffer slot. Python strings, buffer objects and memory
    158             mapped files are examples of objects providing this slot.
    159 
    160             errors defines the error handling to apply. It defaults to
    161             'strict' handling.
    162 
    163             The method may not store state in the Codec instance. Use
    164             StreamReader for codecs which have to keep state in order to
    165             make decoding efficient.
    166 
    167             The decoder must be able to handle zero length input and
    168             return an empty object of the output object type in this
    169             situation.
    170 
    171         """
    172         raise NotImplementedError
    173 
    174 class IncrementalEncoder(object):
    175     """
    176     An IncrementalEncoder encodes an input in multiple steps. The input can be
    177     passed piece by piece to the encode() method. The IncrementalEncoder remembers
    178     the state of the Encoding process between calls to encode().
    179     """
    180     def __init__(self, errors='strict'):
    181         """
    182         Creates an IncrementalEncoder instance.
    183 
    184         The IncrementalEncoder may use different error handling schemes by
    185         providing the errors keyword argument. See the module docstring
    186         for a list of possible values.
    187         """
    188         self.errors = errors
    189         self.buffer = ""
    190 
    191     def encode(self, input, final=False):
    192         """
    193         Encodes input and returns the resulting object.
    194         """
    195         raise NotImplementedError
    196 
    197     def reset(self):
    198         """
    199         Resets the encoder to the initial state.
    200         """
    201 
    202     def getstate(self):
    203         """
    204         Return the current state of the encoder.
    205         """
    206         return 0
    207 
    208     def setstate(self, state):
    209         """
    210         Set the current state of the encoder. state must have been
    211         returned by getstate().
    212         """
    213 
    214 class BufferedIncrementalEncoder(IncrementalEncoder):
    215     """
    216     This subclass of IncrementalEncoder can be used as the baseclass for an
    217     incremental encoder if the encoder must keep some of the output in a
    218     buffer between calls to encode().
    219     """
    220     def __init__(self, errors='strict'):
    221         IncrementalEncoder.__init__(self, errors)
    222         self.buffer = "" # unencoded input that is kept between calls to encode()
    223 
    224     def _buffer_encode(self, input, errors, final):
    225         # Overwrite this method in subclasses: It must encode input
    226         # and return an (output, length consumed) tuple
    227         raise NotImplementedError
    228 
    229     def encode(self, input, final=False):
    230         # encode input (taking the buffer into account)
    231         data = self.buffer + input
    232         (result, consumed) = self._buffer_encode(data, self.errors, final)
    233         # keep unencoded input until the next call
    234         self.buffer = data[consumed:]
    235         return result
    236 
    237     def reset(self):
    238         IncrementalEncoder.reset(self)
    239         self.buffer = ""
    240 
    241     def getstate(self):
    242         return self.buffer or 0
    243 
    244     def setstate(self, state):
    245         self.buffer = state or ""
    246 
    247 class IncrementalDecoder(object):
    248     """
    249     An IncrementalDecoder decodes an input in multiple steps. The input can be
    250     passed piece by piece to the decode() method. The IncrementalDecoder
    251     remembers the state of the decoding process between calls to decode().
    252     """
    253     def __init__(self, errors='strict'):
    254         """
    255         Creates an IncrementalDecoder instance.
    256 
    257         The IncrementalDecoder may use different error handling schemes by
    258         providing the errors keyword argument. See the module docstring
    259         for a list of possible values.
    260         """
    261         self.errors = errors
    262 
    263     def decode(self, input, final=False):
    264         """
    265         Decodes input and returns the resulting object.
    266         """
    267         raise NotImplementedError
    268 
    269     def reset(self):
    270         """
    271         Resets the decoder to the initial state.
    272         """
    273 
    274     def getstate(self):
    275         """
    276         Return the current state of the decoder.
    277 
    278         This must be a (buffered_input, additional_state_info) tuple.
    279         buffered_input must be a bytes object containing bytes that
    280         were passed to decode() that have not yet been converted.
    281         additional_state_info must be a non-negative integer
    282         representing the state of the decoder WITHOUT yet having
    283         processed the contents of buffered_input.  In the initial state
    284         and after reset(), getstate() must return (b"", 0).
    285         """
    286         return (b"", 0)
    287 
    288     def setstate(self, state):
    289         """
    290         Set the current state of the decoder.
    291 
    292         state must have been returned by getstate().  The effect of
    293         setstate((b"", 0)) must be equivalent to reset().
    294         """
    295 
    296 class BufferedIncrementalDecoder(IncrementalDecoder):
    297     """
    298     This subclass of IncrementalDecoder can be used as the baseclass for an
    299     incremental decoder if the decoder must be able to handle incomplete byte
    300     sequences.
    301     """
    302     def __init__(self, errors='strict'):
    303         IncrementalDecoder.__init__(self, errors)
    304         self.buffer = "" # undecoded input that is kept between calls to decode()
    305 
    306     def _buffer_decode(self, input, errors, final):
    307         # Overwrite this method in subclasses: It must decode input
    308         # and return an (output, length consumed) tuple
    309         raise NotImplementedError
    310 
    311     def decode(self, input, final=False):
    312         # decode input (taking the buffer into account)
    313         data = self.buffer + input
    314         (result, consumed) = self._buffer_decode(data, self.errors, final)
    315         # keep undecoded input until the next call
    316         self.buffer = data[consumed:]
    317         return result
    318 
    319     def reset(self):
    320         IncrementalDecoder.reset(self)
    321         self.buffer = ""
    322 
    323     def getstate(self):
    324         # additional state info is always 0
    325         return (self.buffer, 0)
    326 
    327     def setstate(self, state):
    328         # ignore additional state info
    329         self.buffer = state[0]
    330 
    331 #
    332 # The StreamWriter and StreamReader class provide generic working
    333 # interfaces which can be used to implement new encoding submodules
    334 # very easily. See encodings/utf_8.py for an example on how this is
    335 # done.
    336 #
    337 
    338 class StreamWriter(Codec):
    339 
    340     def __init__(self, stream, errors='strict'):
    341 
    342         """ Creates a StreamWriter instance.
    343 
    344             stream must be a file-like object open for writing
    345             (binary) data.
    346 
    347             The StreamWriter may use different error handling
    348             schemes by providing the errors keyword argument. These
    349             parameters are predefined:
    350 
    351              'strict' - raise a ValueError (or a subclass)
    352              'ignore' - ignore the character and continue with the next
    353              'replace'- replace with a suitable replacement character
    354              'xmlcharrefreplace' - Replace with the appropriate XML
    355                                    character reference.
    356              'backslashreplace'  - Replace with backslashed escape
    357                                    sequences (only for encoding).
    358 
    359             The set of allowed parameter values can be extended via
    360             register_error.
    361         """
    362         self.stream = stream
    363         self.errors = errors
    364 
    365     def write(self, object):
    366 
    367         """ Writes the object's contents encoded to self.stream.
    368         """
    369         data, consumed = self.encode(object, self.errors)
    370         self.stream.write(data)
    371 
    372     def writelines(self, list):
    373 
    374         """ Writes the concatenated list of strings to the stream
    375             using .write().
    376         """
    377         self.write(''.join(list))
    378 
    379     def reset(self):
    380 
    381         """ Flushes and resets the codec buffers used for keeping state.
    382 
    383             Calling this method should ensure that the data on the
    384             output is put into a clean state, that allows appending
    385             of new fresh data without having to rescan the whole
    386             stream to recover state.
    387 
    388         """
    389         pass
    390 
    391     def seek(self, offset, whence=0):
    392         self.stream.seek(offset, whence)
    393         if whence == 0 and offset == 0:
    394             self.reset()
    395 
    396     def __getattr__(self, name,
    397                     getattr=getattr):
    398 
    399         """ Inherit all other methods from the underlying stream.
    400         """
    401         return getattr(self.stream, name)
    402 
    403     def __enter__(self):
    404         return self
    405 
    406     def __exit__(self, type, value, tb):
    407         self.stream.close()
    408 
    409 ###
    410 
    411 class StreamReader(Codec):
    412 
    413     def __init__(self, stream, errors='strict'):
    414 
    415         """ Creates a StreamReader instance.
    416 
    417             stream must be a file-like object open for reading
    418             (binary) data.
    419 
    420             The StreamReader may use different error handling
    421             schemes by providing the errors keyword argument. These
    422             parameters are predefined:
    423 
    424              'strict' - raise a ValueError (or a subclass)
    425              'ignore' - ignore the character and continue with the next
    426              'replace'- replace with a suitable replacement character;
    427 
    428             The set of allowed parameter values can be extended via
    429             register_error.
    430         """
    431         self.stream = stream
    432         self.errors = errors
    433         self.bytebuffer = ""
    434         # For str->str decoding this will stay a str
    435         # For str->unicode decoding the first read will promote it to unicode
    436         self.charbuffer = ""
    437         self.linebuffer = None
    438 
    439     def decode(self, input, errors='strict'):
    440         raise NotImplementedError
    441 
    442     def read(self, size=-1, chars=-1, firstline=False):
    443 
    444         """ Decodes data from the stream self.stream and returns the
    445             resulting object.
    446 
    447             chars indicates the number of characters to read from the
    448             stream. read() will never return more than chars
    449             characters, but it might return less, if there are not enough
    450             characters available.
    451 
    452             size indicates the approximate maximum number of bytes to
    453             read from the stream for decoding purposes. The decoder
    454             can modify this setting as appropriate. The default value
    455             -1 indicates to read and decode as much as possible.  size
    456             is intended to prevent having to decode huge files in one
    457             step.
    458 
    459             If firstline is true, and a UnicodeDecodeError happens
    460             after the first line terminator in the input only the first line
    461             will be returned, the rest of the input will be kept until the
    462             next call to read().
    463 
    464             The method should use a greedy read strategy meaning that
    465             it should read as much data as is allowed within the
    466             definition of the encoding and the given size, e.g.  if
    467             optional encoding endings or state markers are available
    468             on the stream, these should be read too.
    469         """
    470         # If we have lines cached, first merge them back into characters
    471         if self.linebuffer:
    472             self.charbuffer = "".join(self.linebuffer)
    473             self.linebuffer = None
    474 
    475         if chars < 0:
    476             # For compatibility with other read() methods that take a
    477             # single argument
    478             chars = size
    479 
    480         # read until we get the required number of characters (if available)
    481         while True:
    482             # can the request be satisfied from the character buffer?
    483             if chars >= 0:
    484                 if len(self.charbuffer) >= chars:
    485                     break
    486             # we need more data
    487             if size < 0:
    488                 newdata = self.stream.read()
    489             else:
    490                 newdata = self.stream.read(size)
    491             # decode bytes (those remaining from the last call included)
    492             data = self.bytebuffer + newdata
    493             try:
    494                 newchars, decodedbytes = self.decode(data, self.errors)
    495             except UnicodeDecodeError, exc:
    496                 if firstline:
    497                     newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
    498                     lines = newchars.splitlines(True)
    499                     if len(lines)<=1:
    500                         raise
    501                 else:
    502                     raise
    503             # keep undecoded bytes until the next call
    504             self.bytebuffer = data[decodedbytes:]
    505             # put new characters in the character buffer
    506             self.charbuffer += newchars
    507             # there was no data available
    508             if not newdata:
    509                 break
    510         if chars < 0:
    511             # Return everything we've got
    512             result = self.charbuffer
    513             self.charbuffer = ""
    514         else:
    515             # Return the first chars characters
    516             result = self.charbuffer[:chars]
    517             self.charbuffer = self.charbuffer[chars:]
    518         return result
    519 
    520     def readline(self, size=None, keepends=True):
    521 
    522         """ Read one line from the input stream and return the
    523             decoded data.
    524 
    525             size, if given, is passed as size argument to the
    526             read() method.
    527 
    528         """
    529         # If we have lines cached from an earlier read, return
    530         # them unconditionally
    531         if self.linebuffer:
    532             line = self.linebuffer[0]
    533             del self.linebuffer[0]
    534             if len(self.linebuffer) == 1:
    535                 # revert to charbuffer mode; we might need more data
    536                 # next time
    537                 self.charbuffer = self.linebuffer[0]
    538                 self.linebuffer = None
    539             if not keepends:
    540                 line = line.splitlines(False)[0]
    541             return line
    542 
    543         readsize = size or 72
    544         line = ""
    545         # If size is given, we call read() only once
    546         while True:
    547             data = self.read(readsize, firstline=True)
    548             if data:
    549                 # If we're at a "\r" read one extra character (which might
    550                 # be a "\n") to get a proper line ending. If the stream is
    551                 # temporarily exhausted we return the wrong line ending.
    552                 if data.endswith("\r"):
    553                     data += self.read(size=1, chars=1)
    554 
    555             line += data
    556             lines = line.splitlines(True)
    557             if lines:
    558                 if len(lines) > 1:
    559                     # More than one line result; the first line is a full line
    560                     # to return
    561                     line = lines[0]
    562                     del lines[0]
    563                     if len(lines) > 1:
    564                         # cache the remaining lines
    565                         lines[-1] += self.charbuffer
    566                         self.linebuffer = lines
    567                         self.charbuffer = None
    568                     else:
    569                         # only one remaining line, put it back into charbuffer
    570                         self.charbuffer = lines[0] + self.charbuffer
    571                     if not keepends:
    572                         line = line.splitlines(False)[0]
    573                     break
    574                 line0withend = lines[0]
    575                 line0withoutend = lines[0].splitlines(False)[0]
    576                 if line0withend != line0withoutend: # We really have a line end
    577                     # Put the rest back together and keep it until the next call
    578                     self.charbuffer = "".join(lines[1:]) + self.charbuffer
    579                     if keepends:
    580                         line = line0withend
    581                     else:
    582                         line = line0withoutend
    583                     break
    584             # we didn't get anything or this was our only try
    585             if not data or size is not None:
    586                 if line and not keepends:
    587                     line = line.splitlines(False)[0]
    588                 break
    589             if readsize<8000:
    590                 readsize *= 2
    591         return line
    592 
    593     def readlines(self, sizehint=None, keepends=True):
    594 
    595         """ Read all lines available on the input stream
    596             and return them as list of lines.
    597 
    598             Line breaks are implemented using the codec's decoder
    599             method and are included in the list entries.
    600 
    601             sizehint, if given, is ignored since there is no efficient
    602             way to finding the true end-of-line.
    603 
    604         """
    605         data = self.read()
    606         return data.splitlines(keepends)
    607 
    608     def reset(self):
    609 
    610         """ Resets the codec buffers used for keeping state.
    611 
    612             Note that no stream repositioning should take place.
    613             This method is primarily intended to be able to recover
    614             from decoding errors.
    615 
    616         """
    617         self.bytebuffer = ""
    618         self.charbuffer = u""
    619         self.linebuffer = None
    620 
    621     def seek(self, offset, whence=0):
    622         """ Set the input stream's current position.
    623 
    624             Resets the codec buffers used for keeping state.
    625         """
    626         self.stream.seek(offset, whence)
    627         self.reset()
    628 
    629     def next(self):
    630 
    631         """ Return the next decoded line from the input stream."""
    632         line = self.readline()
    633         if line:
    634             return line
    635         raise StopIteration
    636 
    637     def __iter__(self):
    638         return self
    639 
    640     def __getattr__(self, name,
    641                     getattr=getattr):
    642 
    643         """ Inherit all other methods from the underlying stream.
    644         """
    645         return getattr(self.stream, name)
    646 
    647     def __enter__(self):
    648         return self
    649 
    650     def __exit__(self, type, value, tb):
    651         self.stream.close()
    652 
    653 ###
    654 
    655 class StreamReaderWriter:
    656 
    657     """ StreamReaderWriter instances allow wrapping streams which
    658         work in both read and write modes.
    659 
    660         The design is such that one can use the factory functions
    661         returned by the codec.lookup() function to construct the
    662         instance.
    663 
    664     """
    665     # Optional attributes set by the file wrappers below
    666     encoding = 'unknown'
    667 
    668     def __init__(self, stream, Reader, Writer, errors='strict'):
    669 
    670         """ Creates a StreamReaderWriter instance.
    671 
    672             stream must be a Stream-like object.
    673 
    674             Reader, Writer must be factory functions or classes
    675             providing the StreamReader, StreamWriter interface resp.
    676 
    677             Error handling is done in the same way as defined for the
    678             StreamWriter/Readers.
    679 
    680         """
    681         self.stream = stream
    682         self.reader = Reader(stream, errors)
    683         self.writer = Writer(stream, errors)
    684         self.errors = errors
    685 
    686     def read(self, size=-1):
    687 
    688         return self.reader.read(size)
    689 
    690     def readline(self, size=None):
    691 
    692         return self.reader.readline(size)
    693 
    694     def readlines(self, sizehint=None):
    695 
    696         return self.reader.readlines(sizehint)
    697 
    698     def next(self):
    699 
    700         """ Return the next decoded line from the input stream."""
    701         return self.reader.next()
    702 
    703     def __iter__(self):
    704         return self
    705 
    706     def write(self, data):
    707 
    708         return self.writer.write(data)
    709 
    710     def writelines(self, list):
    711 
    712         return self.writer.writelines(list)
    713 
    714     def reset(self):
    715 
    716         self.reader.reset()
    717         self.writer.reset()
    718 
    719     def seek(self, offset, whence=0):
    720         self.stream.seek(offset, whence)
    721         self.reader.reset()
    722         if whence == 0 and offset == 0:
    723             self.writer.reset()
    724 
    725     def __getattr__(self, name,
    726                     getattr=getattr):
    727 
    728         """ Inherit all other methods from the underlying stream.
    729         """
    730         return getattr(self.stream, name)
    731 
    732     # these are needed to make "with codecs.open(...)" work properly
    733 
    734     def __enter__(self):
    735         return self
    736 
    737     def __exit__(self, type, value, tb):
    738         self.stream.close()
    739 
    740 ###
    741 
    742 class StreamRecoder:
    743 
    744     """ StreamRecoder instances provide a frontend - backend
    745         view of encoding data.
    746 
    747         They use the complete set of APIs returned by the
    748         codecs.lookup() function to implement their task.
    749 
    750         Data written to the stream is first decoded into an
    751         intermediate format (which is dependent on the given codec
    752         combination) and then written to the stream using an instance
    753         of the provided Writer class.
    754 
    755         In the other direction, data is read from the stream using a
    756         Reader instance and then return encoded data to the caller.
    757 
    758     """
    759     # Optional attributes set by the file wrappers below
    760     data_encoding = 'unknown'
    761     file_encoding = 'unknown'
    762 
    763     def __init__(self, stream, encode, decode, Reader, Writer,
    764                  errors='strict'):
    765 
    766         """ Creates a StreamRecoder instance which implements a two-way
    767             conversion: encode and decode work on the frontend (the
    768             input to .read() and output of .write()) while
    769             Reader and Writer work on the backend (reading and
    770             writing to the stream).
    771 
    772             You can use these objects to do transparent direct
    773             recodings from e.g. latin-1 to utf-8 and back.
    774 
    775             stream must be a file-like object.
    776 
    777             encode, decode must adhere to the Codec interface, Reader,
    778             Writer must be factory functions or classes providing the
    779             StreamReader, StreamWriter interface resp.
    780 
    781             encode and decode are needed for the frontend translation,
    782             Reader and Writer for the backend translation. Unicode is
    783             used as intermediate encoding.
    784 
    785             Error handling is done in the same way as defined for the
    786             StreamWriter/Readers.
    787 
    788         """
    789         self.stream = stream
    790         self.encode = encode
    791         self.decode = decode
    792         self.reader = Reader(stream, errors)
    793         self.writer = Writer(stream, errors)
    794         self.errors = errors
    795 
    796     def read(self, size=-1):
    797 
    798         data = self.reader.read(size)
    799         data, bytesencoded = self.encode(data, self.errors)
    800         return data
    801 
    802     def readline(self, size=None):
    803 
    804         if size is None:
    805             data = self.reader.readline()
    806         else:
    807             data = self.reader.readline(size)
    808         data, bytesencoded = self.encode(data, self.errors)
    809         return data
    810 
    811     def readlines(self, sizehint=None):
    812 
    813         data = self.reader.read()
    814         data, bytesencoded = self.encode(data, self.errors)
    815         return data.splitlines(1)
    816 
    817     def next(self):
    818 
    819         """ Return the next decoded line from the input stream."""
    820         data = self.reader.next()
    821         data, bytesencoded = self.encode(data, self.errors)
    822         return data
    823 
    824     def __iter__(self):
    825         return self
    826 
    827     def write(self, data):
    828 
    829         data, bytesdecoded = self.decode(data, self.errors)
    830         return self.writer.write(data)
    831 
    832     def writelines(self, list):
    833 
    834         data = ''.join(list)
    835         data, bytesdecoded = self.decode(data, self.errors)
    836         return self.writer.write(data)
    837 
    838     def reset(self):
    839 
    840         self.reader.reset()
    841         self.writer.reset()
    842 
    843     def __getattr__(self, name,
    844                     getattr=getattr):
    845 
    846         """ Inherit all other methods from the underlying stream.
    847         """
    848         return getattr(self.stream, name)
    849 
    850     def __enter__(self):
    851         return self
    852 
    853     def __exit__(self, type, value, tb):
    854         self.stream.close()
    855 
    856 ### Shortcuts
    857 
    858 def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
    859 
    860     """ Open an encoded file using the given mode and return
    861         a wrapped version providing transparent encoding/decoding.
    862 
    863         Note: The wrapped version will only accept the object format
    864         defined by the codecs, i.e. Unicode objects for most builtin
    865         codecs. Output is also codec dependent and will usually be
    866         Unicode as well.
    867 
    868         Files are always opened in binary mode, even if no binary mode
    869         was specified. This is done to avoid data loss due to encodings
    870         using 8-bit values. The default file mode is 'rb' meaning to
    871         open the file in binary read mode.
    872 
    873         encoding specifies the encoding which is to be used for the
    874         file.
    875 
    876         errors may be given to define the error handling. It defaults
    877         to 'strict' which causes ValueErrors to be raised in case an
    878         encoding error occurs.
    879 
    880         buffering has the same meaning as for the builtin open() API.
    881         It defaults to line buffered.
    882 
    883         The returned wrapped file object provides an extra attribute
    884         .encoding which allows querying the used encoding. This
    885         attribute is only available if an encoding was specified as
    886         parameter.
    887 
    888     """
    889     if encoding is not None:
    890         if 'U' in mode:
    891             # No automatic conversion of '\n' is done on reading and writing
    892             mode = mode.strip().replace('U', '')
    893             if mode[:1] not in set('rwa'):
    894                 mode = 'r' + mode
    895         if 'b' not in mode:
    896             # Force opening of the file in binary mode
    897             mode = mode + 'b'
    898     file = __builtin__.open(filename, mode, buffering)
    899     if encoding is None:
    900         return file
    901     info = lookup(encoding)
    902     srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    903     # Add attributes to simplify introspection
    904     srw.encoding = encoding
    905     return srw
    906 
    907 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
    908 
    909     """ Return a wrapped version of file which provides transparent
    910         encoding translation.
    911 
    912         Strings written to the wrapped file are interpreted according
    913         to the given data_encoding and then written to the original
    914         file as string using file_encoding. The intermediate encoding
    915         will usually be Unicode but depends on the specified codecs.
    916 
    917         Strings are read from the file using file_encoding and then
    918         passed back to the caller as string using data_encoding.
    919 
    920         If file_encoding is not given, it defaults to data_encoding.
    921 
    922         errors may be given to define the error handling. It defaults
    923         to 'strict' which causes ValueErrors to be raised in case an
    924         encoding error occurs.
    925 
    926         The returned wrapped file object provides two extra attributes
    927         .data_encoding and .file_encoding which reflect the given
    928         parameters of the same name. The attributes can be used for
    929         introspection by Python programs.
    930 
    931     """
    932     if file_encoding is None:
    933         file_encoding = data_encoding
    934     data_info = lookup(data_encoding)
    935     file_info = lookup(file_encoding)
    936     sr = StreamRecoder(file, data_info.encode, data_info.decode,
    937                        file_info.streamreader, file_info.streamwriter, errors)
    938     # Add attributes to simplify introspection
    939     sr.data_encoding = data_encoding
    940     sr.file_encoding = file_encoding
    941     return sr
    942 
    943 ### Helpers for codec lookup
    944 
    945 def getencoder(encoding):
    946 
    947     """ Lookup up the codec for the given encoding and return
    948         its encoder function.
    949 
    950         Raises a LookupError in case the encoding cannot be found.
    951 
    952     """
    953     return lookup(encoding).encode
    954 
    955 def getdecoder(encoding):
    956 
    957     """ Lookup up the codec for the given encoding and return
    958         its decoder function.
    959 
    960         Raises a LookupError in case the encoding cannot be found.
    961 
    962     """
    963     return lookup(encoding).decode
    964 
    965 def getincrementalencoder(encoding):
    966 
    967     """ Lookup up the codec for the given encoding and return
    968         its IncrementalEncoder class or factory function.
    969 
    970         Raises a LookupError in case the encoding cannot be found
    971         or the codecs doesn't provide an incremental encoder.
    972 
    973     """
    974     encoder = lookup(encoding).incrementalencoder
    975     if encoder is None:
    976         raise LookupError(encoding)
    977     return encoder
    978 
    979 def getincrementaldecoder(encoding):
    980 
    981     """ Lookup up the codec for the given encoding and return
    982         its IncrementalDecoder class or factory function.
    983 
    984         Raises a LookupError in case the encoding cannot be found
    985         or the codecs doesn't provide an incremental decoder.
    986 
    987     """
    988     decoder = lookup(encoding).incrementaldecoder
    989     if decoder is None:
    990         raise LookupError(encoding)
    991     return decoder
    992 
    993 def getreader(encoding):
    994 
    995     """ Lookup up the codec for the given encoding and return
    996         its StreamReader class or factory function.
    997 
    998         Raises a LookupError in case the encoding cannot be found.
    999 
   1000     """
   1001     return lookup(encoding).streamreader
   1002 
   1003 def getwriter(encoding):
   1004 
   1005     """ Lookup up the codec for the given encoding and return
   1006         its StreamWriter class or factory function.
   1007 
   1008         Raises a LookupError in case the encoding cannot be found.
   1009 
   1010     """
   1011     return lookup(encoding).streamwriter
   1012 
   1013 def iterencode(iterator, encoding, errors='strict', **kwargs):
   1014     """
   1015     Encoding iterator.
   1016 
   1017     Encodes the input strings from the iterator using an IncrementalEncoder.
   1018 
   1019     errors and kwargs are passed through to the IncrementalEncoder
   1020     constructor.
   1021     """
   1022     encoder = getincrementalencoder(encoding)(errors, **kwargs)
   1023     for input in iterator:
   1024         output = encoder.encode(input)
   1025         if output:
   1026             yield output
   1027     output = encoder.encode("", True)
   1028     if output:
   1029         yield output
   1030 
   1031 def iterdecode(iterator, encoding, errors='strict', **kwargs):
   1032     """
   1033     Decoding iterator.
   1034 
   1035     Decodes the input strings from the iterator using an IncrementalDecoder.
   1036 
   1037     errors and kwargs are passed through to the IncrementalDecoder
   1038     constructor.
   1039     """
   1040     decoder = getincrementaldecoder(encoding)(errors, **kwargs)
   1041     for input in iterator:
   1042         output = decoder.decode(input)
   1043         if output:
   1044             yield output
   1045     output = decoder.decode("", True)
   1046     if output:
   1047         yield output
   1048 
   1049 ### Helpers for charmap-based codecs
   1050 
   1051 def make_identity_dict(rng):
   1052 
   1053     """ make_identity_dict(rng) -> dict
   1054 
   1055         Return a dictionary where elements of the rng sequence are
   1056         mapped to themselves.
   1057 
   1058     """
   1059     res = {}
   1060     for i in rng:
   1061         res[i]=i
   1062     return res
   1063 
   1064 def make_encoding_map(decoding_map):
   1065 
   1066     """ Creates an encoding map from a decoding map.
   1067 
   1068         If a target mapping in the decoding map occurs multiple
   1069         times, then that target is mapped to None (undefined mapping),
   1070         causing an exception when encountered by the charmap codec
   1071         during translation.
   1072 
   1073         One example where this happens is cp875.py which decodes
   1074         multiple character to \\u001a.
   1075 
   1076     """
   1077     m = {}
   1078     for k,v in decoding_map.items():
   1079         if not v in m:
   1080             m[v] = k
   1081         else:
   1082             m[v] = None
   1083     return m
   1084 
   1085 ### error handlers
   1086 
   1087 try:
   1088     strict_errors = lookup_error("strict")
   1089     ignore_errors = lookup_error("ignore")
   1090     replace_errors = lookup_error("replace")
   1091     xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
   1092     backslashreplace_errors = lookup_error("backslashreplace")
   1093 except LookupError:
   1094     # In --disable-unicode builds, these error handler are missing
   1095     strict_errors = None
   1096     ignore_errors = None
   1097     replace_errors = None
   1098     xmlcharrefreplace_errors = None
   1099     backslashreplace_errors = None
   1100 
# Tell modulefinder that using codecs probably needs the encodings
# package.  _false is always 0, so the import below is never executed
# at runtime; it exists only so that an "import encodings" statement
# is present in this module.
# NOTE(review): presumably a named variable is tested instead of a
# literal 0 so that the compiler does not drop the branch -- confirm.
_false = 0
if _false:
    import encodings
   1106 
   1107 ### Tests
   1108 
if __name__ == '__main__':

    # When run as a script, both standard streams are replaced by
    # recoding wrappers created with EncodedFile().

    # Make stdout translate Latin-1 output into UTF-8 output:
    # strings written to sys.stdout are interpreted as Latin-1 and
    # reach the original stdout as UTF-8.
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input:
    # data is read from the original stdin as Latin-1 and handed to
    # the caller as UTF-8.
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
   1116