Home | History | Annotate | Download | only in Lib
      1 """ codecs -- Python Codec Registry, API and helpers.
      2 
      3 
Written by Marc-Andre Lemburg (mal@lemburg.com).
      5 
      6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
      7 
      8 """#"
      9 
     10 import __builtin__, sys
     11 
     12 ### Registry and builtin stateless codec functions
     13 
     14 try:
     15     from _codecs import *
     16 except ImportError, why:
     17     raise SystemError('Failed to load the builtin codecs: %s' % why)
     18 
# Names exported by "from codecs import *": the BOM constants, the codec
# base classes and stream wrappers, and the registry / lookup / error-handler
# helpers listed below.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
     32 
     33 ### Constants
     34 
     35 #
     36 # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
     37 # and its possible byte string values
     38 # for UTF8/UTF16/UTF32 output and little/big endian machines
     39 #
     40 
     41 # UTF-8
     42 BOM_UTF8 = '\xef\xbb\xbf'
     43 
     44 # UTF-16, little endian
     45 BOM_LE = BOM_UTF16_LE = '\xff\xfe'
     46 
     47 # UTF-16, big endian
     48 BOM_BE = BOM_UTF16_BE = '\xfe\xff'
     49 
     50 # UTF-32, little endian
     51 BOM_UTF32_LE = '\xff\xfe\x00\x00'
     52 
     53 # UTF-32, big endian
     54 BOM_UTF32_BE = '\x00\x00\xfe\xff'
     55 
     56 if sys.byteorder == 'little':
     57 
     58     # UTF-16, native endianness
     59     BOM = BOM_UTF16 = BOM_UTF16_LE
     60 
     61     # UTF-32, native endianness
     62     BOM_UTF32 = BOM_UTF32_LE
     63 
     64 else:
     65 
     66     # UTF-16, native endianness
     67     BOM = BOM_UTF16 = BOM_UTF16_BE
     68 
     69     # UTF-32, native endianness
     70     BOM_UTF32 = BOM_UTF32_BE
     71 
     72 # Old broken names (don't use in new code)
     73 BOM32_LE = BOM_UTF16_LE
     74 BOM32_BE = BOM_UTF16_BE
     75 BOM64_LE = BOM_UTF32_LE
     76 BOM64_BE = BOM_UTF32_BE
     77 
     78 
     79 ### Codec base classes (defining the API)
     80 
     81 class CodecInfo(tuple):
     82     """Codec details when looking up the codec registry"""
     83 
     84     # Private API to allow Python to blacklist the known non-Unicode
     85     # codecs in the standard library. A more general mechanism to
     86     # reliably distinguish test encodings from other codecs will hopefully
     87     # be defined for Python 3.5
     88     #
     89     # See http://bugs.python.org/issue19619
     90     _is_text_encoding = True # Assume codecs are text encodings by default
     91 
     92     def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
     93         incrementalencoder=None, incrementaldecoder=None, name=None,
     94         _is_text_encoding=None):
     95         self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
     96         self.name = name
     97         self.encode = encode
     98         self.decode = decode
     99         self.incrementalencoder = incrementalencoder
    100         self.incrementaldecoder = incrementaldecoder
    101         self.streamwriter = streamwriter
    102         self.streamreader = streamreader
    103         if _is_text_encoding is not None:
    104             self._is_text_encoding = _is_text_encoding
    105         return self
    106 
    107     def __repr__(self):
    108         return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
    109 
    110 class Codec:
    111 
    112     """ Defines the interface for stateless encoders/decoders.
    113 
    114         The .encode()/.decode() methods may use different error
    115         handling schemes by providing the errors argument. These
    116         string values are predefined:
    117 
    118          'strict' - raise a ValueError error (or a subclass)
    119          'ignore' - ignore the character and continue with the next
    120          'replace' - replace with a suitable replacement character;
    121                     Python will use the official U+FFFD REPLACEMENT
    122                     CHARACTER for the builtin Unicode codecs on
    123                     decoding and '?' on encoding.
    124          'xmlcharrefreplace' - Replace with the appropriate XML
    125                                character reference (only for encoding).
    126          'backslashreplace'  - Replace with backslashed escape sequences
    127                                (only for encoding).
    128 
    129         The set of allowed values can be extended via register_error.
    130 
    131     """
    132     def encode(self, input, errors='strict'):
    133 
    134         """ Encodes the object input and returns a tuple (output
    135             object, length consumed).
    136 
    137             errors defines the error handling to apply. It defaults to
    138             'strict' handling.
    139 
    140             The method may not store state in the Codec instance. Use
    141             StreamWriter for codecs which have to keep state in order to
    142             make encoding efficient.
    143 
    144             The encoder must be able to handle zero length input and
    145             return an empty object of the output object type in this
    146             situation.
    147 
    148         """
    149         raise NotImplementedError
    150 
    151     def decode(self, input, errors='strict'):
    152 
    153         """ Decodes the object input and returns a tuple (output
    154             object, length consumed).
    155 
    156             input must be an object which provides the bf_getreadbuf
    157             buffer slot. Python strings, buffer objects and memory
    158             mapped files are examples of objects providing this slot.
    159 
    160             errors defines the error handling to apply. It defaults to
    161             'strict' handling.
    162 
    163             The method may not store state in the Codec instance. Use
    164             StreamReader for codecs which have to keep state in order to
    165             make decoding efficient.
    166 
    167             The decoder must be able to handle zero length input and
    168             return an empty object of the output object type in this
    169             situation.
    170 
    171         """
    172         raise NotImplementedError
    173 
    174 class IncrementalEncoder(object):
    175     """
    176     An IncrementalEncoder encodes an input in multiple steps. The input can be
    177     passed piece by piece to the encode() method. The IncrementalEncoder remembers
    178     the state of the Encoding process between calls to encode().
    179     """
    180     def __init__(self, errors='strict'):
    181         """
    182         Creates an IncrementalEncoder instance.
    183 
    184         The IncrementalEncoder may use different error handling schemes by
    185         providing the errors keyword argument. See the module docstring
    186         for a list of possible values.
    187         """
    188         self.errors = errors
    189         self.buffer = ""
    190 
    191     def encode(self, input, final=False):
    192         """
    193         Encodes input and returns the resulting object.
    194         """
    195         raise NotImplementedError
    196 
    197     def reset(self):
    198         """
    199         Resets the encoder to the initial state.
    200         """
    201 
    202     def getstate(self):
    203         """
    204         Return the current state of the encoder.
    205         """
    206         return 0
    207 
    208     def setstate(self, state):
    209         """
    210         Set the current state of the encoder. state must have been
    211         returned by getstate().
    212         """
    213 
    214 class BufferedIncrementalEncoder(IncrementalEncoder):
    215     """
    216     This subclass of IncrementalEncoder can be used as the baseclass for an
    217     incremental encoder if the encoder must keep some of the output in a
    218     buffer between calls to encode().
    219     """
    220     def __init__(self, errors='strict'):
    221         IncrementalEncoder.__init__(self, errors)
    222         self.buffer = "" # unencoded input that is kept between calls to encode()
    223 
    224     def _buffer_encode(self, input, errors, final):
    225         # Overwrite this method in subclasses: It must encode input
    226         # and return an (output, length consumed) tuple
    227         raise NotImplementedError
    228 
    229     def encode(self, input, final=False):
    230         # encode input (taking the buffer into account)
    231         data = self.buffer + input
    232         (result, consumed) = self._buffer_encode(data, self.errors, final)
    233         # keep unencoded input until the next call
    234         self.buffer = data[consumed:]
    235         return result
    236 
    237     def reset(self):
    238         IncrementalEncoder.reset(self)
    239         self.buffer = ""
    240 
    241     def getstate(self):
    242         return self.buffer or 0
    243 
    244     def setstate(self, state):
    245         self.buffer = state or ""
    246 
    247 class IncrementalDecoder(object):
    248     """
    249     An IncrementalDecoder decodes an input in multiple steps. The input can be
    250     passed piece by piece to the decode() method. The IncrementalDecoder
    251     remembers the state of the decoding process between calls to decode().
    252     """
    253     def __init__(self, errors='strict'):
    254         """
    255         Creates an IncrementalDecoder instance.
    256 
    257         The IncrementalDecoder may use different error handling schemes by
    258         providing the errors keyword argument. See the module docstring
    259         for a list of possible values.
    260         """
    261         self.errors = errors
    262 
    263     def decode(self, input, final=False):
    264         """
    265         Decodes input and returns the resulting object.
    266         """
    267         raise NotImplementedError
    268 
    269     def reset(self):
    270         """
    271         Resets the decoder to the initial state.
    272         """
    273 
    274     def getstate(self):
    275         """
    276         Return the current state of the decoder.
    277 
    278         This must be a (buffered_input, additional_state_info) tuple.
    279         buffered_input must be a bytes object containing bytes that
    280         were passed to decode() that have not yet been converted.
    281         additional_state_info must be a non-negative integer
    282         representing the state of the decoder WITHOUT yet having
    283         processed the contents of buffered_input.  In the initial state
    284         and after reset(), getstate() must return (b"", 0).
    285         """
    286         return (b"", 0)
    287 
    288     def setstate(self, state):
    289         """
    290         Set the current state of the decoder.
    291 
    292         state must have been returned by getstate().  The effect of
    293         setstate((b"", 0)) must be equivalent to reset().
    294         """
    295 
    296 class BufferedIncrementalDecoder(IncrementalDecoder):
    297     """
    298     This subclass of IncrementalDecoder can be used as the baseclass for an
    299     incremental decoder if the decoder must be able to handle incomplete byte
    300     sequences.
    301     """
    302     def __init__(self, errors='strict'):
    303         IncrementalDecoder.__init__(self, errors)
    304         self.buffer = "" # undecoded input that is kept between calls to decode()
    305 
    306     def _buffer_decode(self, input, errors, final):
    307         # Overwrite this method in subclasses: It must decode input
    308         # and return an (output, length consumed) tuple
    309         raise NotImplementedError
    310 
    311     def decode(self, input, final=False):
    312         # decode input (taking the buffer into account)
    313         data = self.buffer + input
    314         (result, consumed) = self._buffer_decode(data, self.errors, final)
    315         # keep undecoded input until the next call
    316         self.buffer = data[consumed:]
    317         return result
    318 
    319     def reset(self):
    320         IncrementalDecoder.reset(self)
    321         self.buffer = ""
    322 
    323     def getstate(self):
    324         # additional state info is always 0
    325         return (self.buffer, 0)
    326 
    327     def setstate(self, state):
    328         # ignore additional state info
    329         self.buffer = state[0]
    330 
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
    337 
    338 class StreamWriter(Codec):
    339 
    340     def __init__(self, stream, errors='strict'):
    341 
    342         """ Creates a StreamWriter instance.
    343 
    344             stream must be a file-like object open for writing
    345             (binary) data.
    346 
    347             The StreamWriter may use different error handling
    348             schemes by providing the errors keyword argument. These
    349             parameters are predefined:
    350 
    351              'strict' - raise a ValueError (or a subclass)
    352              'ignore' - ignore the character and continue with the next
    353              'replace'- replace with a suitable replacement character
    354              'xmlcharrefreplace' - Replace with the appropriate XML
    355                                    character reference.
    356              'backslashreplace'  - Replace with backslashed escape
    357                                    sequences (only for encoding).
    358 
    359             The set of allowed parameter values can be extended via
    360             register_error.
    361         """
    362         self.stream = stream
    363         self.errors = errors
    364 
    365     def write(self, object):
    366 
    367         """ Writes the object's contents encoded to self.stream.
    368         """
    369         data, consumed = self.encode(object, self.errors)
    370         self.stream.write(data)
    371 
    372     def writelines(self, list):
    373 
    374         """ Writes the concatenated list of strings to the stream
    375             using .write().
    376         """
    377         self.write(''.join(list))
    378 
    379     def reset(self):
    380 
    381         """ Flushes and resets the codec buffers used for keeping state.
    382 
    383             Calling this method should ensure that the data on the
    384             output is put into a clean state, that allows appending
    385             of new fresh data without having to rescan the whole
    386             stream to recover state.
    387 
    388         """
    389         pass
    390 
    391     def seek(self, offset, whence=0):
    392         self.stream.seek(offset, whence)
    393         if whence == 0 and offset == 0:
    394             self.reset()
    395 
    396     def __getattr__(self, name,
    397                     getattr=getattr):
    398 
    399         """ Inherit all other methods from the underlying stream.
    400         """
    401         return getattr(self.stream, name)
    402 
    403     def __enter__(self):
    404         return self
    405 
    406     def __exit__(self, type, value, tb):
    407         self.stream.close()
    408 
    409 ###
    410 
    411 class StreamReader(Codec):
    412 
    413     def __init__(self, stream, errors='strict'):
    414 
    415         """ Creates a StreamReader instance.
    416 
    417             stream must be a file-like object open for reading
    418             (binary) data.
    419 
    420             The StreamReader may use different error handling
    421             schemes by providing the errors keyword argument. These
    422             parameters are predefined:
    423 
    424              'strict' - raise a ValueError (or a subclass)
    425              'ignore' - ignore the character and continue with the next
    426              'replace'- replace with a suitable replacement character;
    427 
    428             The set of allowed parameter values can be extended via
    429             register_error.
    430         """
    431         self.stream = stream
    432         self.errors = errors
    433         self.bytebuffer = ""
    434         # For str->str decoding this will stay a str
    435         # For str->unicode decoding the first read will promote it to unicode
    436         self.charbuffer = ""
    437         self.linebuffer = None
    438 
    439     def decode(self, input, errors='strict'):
    440         raise NotImplementedError
    441 
    442     def read(self, size=-1, chars=-1, firstline=False):
    443 
    444         """ Decodes data from the stream self.stream and returns the
    445             resulting object.
    446 
    447             chars indicates the number of characters to read from the
    448             stream. read() will never return more than chars
    449             characters, but it might return less, if there are not enough
    450             characters available.
    451 
    452             size indicates the approximate maximum number of bytes to
    453             read from the stream for decoding purposes. The decoder
    454             can modify this setting as appropriate. The default value
    455             -1 indicates to read and decode as much as possible.  size
    456             is intended to prevent having to decode huge files in one
    457             step.
    458 
    459             If firstline is true, and a UnicodeDecodeError happens
    460             after the first line terminator in the input only the first line
    461             will be returned, the rest of the input will be kept until the
    462             next call to read().
    463 
    464             The method should use a greedy read strategy meaning that
    465             it should read as much data as is allowed within the
    466             definition of the encoding and the given size, e.g.  if
    467             optional encoding endings or state markers are available
    468             on the stream, these should be read too.
    469         """
    470         # If we have lines cached, first merge them back into characters
    471         if self.linebuffer:
    472             self.charbuffer = "".join(self.linebuffer)
    473             self.linebuffer = None
    474 
    475         # read until we get the required number of characters (if available)
    476         while True:
    477             # can the request be satisfied from the character buffer?
    478             if chars >= 0:
    479                 if len(self.charbuffer) >= chars:
    480                     break
    481             elif size >= 0:
    482                 if len(self.charbuffer) >= size:
    483                     break
    484             # we need more data
    485             if size < 0:
    486                 newdata = self.stream.read()
    487             else:
    488                 newdata = self.stream.read(size)
    489             # decode bytes (those remaining from the last call included)
    490             data = self.bytebuffer + newdata
    491             try:
    492                 newchars, decodedbytes = self.decode(data, self.errors)
    493             except UnicodeDecodeError, exc:
    494                 if firstline:
    495                     newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
    496                     lines = newchars.splitlines(True)
    497                     if len(lines)<=1:
    498                         raise
    499                 else:
    500                     raise
    501             # keep undecoded bytes until the next call
    502             self.bytebuffer = data[decodedbytes:]
    503             # put new characters in the character buffer
    504             self.charbuffer += newchars
    505             # there was no data available
    506             if not newdata:
    507                 break
    508         if chars < 0:
    509             # Return everything we've got
    510             result = self.charbuffer
    511             self.charbuffer = ""
    512         else:
    513             # Return the first chars characters
    514             result = self.charbuffer[:chars]
    515             self.charbuffer = self.charbuffer[chars:]
    516         return result
    517 
    518     def readline(self, size=None, keepends=True):
    519 
    520         """ Read one line from the input stream and return the
    521             decoded data.
    522 
    523             size, if given, is passed as size argument to the
    524             read() method.
    525 
    526         """
    527         # If we have lines cached from an earlier read, return
    528         # them unconditionally
    529         if self.linebuffer:
    530             line = self.linebuffer[0]
    531             del self.linebuffer[0]
    532             if len(self.linebuffer) == 1:
    533                 # revert to charbuffer mode; we might need more data
    534                 # next time
    535                 self.charbuffer = self.linebuffer[0]
    536                 self.linebuffer = None
    537             if not keepends:
    538                 line = line.splitlines(False)[0]
    539             return line
    540 
    541         readsize = size or 72
    542         line = ""
    543         # If size is given, we call read() only once
    544         while True:
    545             data = self.read(readsize, firstline=True)
    546             if data:
    547                 # If we're at a "\r" read one extra character (which might
    548                 # be a "\n") to get a proper line ending. If the stream is
    549                 # temporarily exhausted we return the wrong line ending.
    550                 if data.endswith("\r"):
    551                     data += self.read(size=1, chars=1)
    552 
    553             line += data
    554             lines = line.splitlines(True)
    555             if lines:
    556                 if len(lines) > 1:
    557                     # More than one line result; the first line is a full line
    558                     # to return
    559                     line = lines[0]
    560                     del lines[0]
    561                     if len(lines) > 1:
    562                         # cache the remaining lines
    563                         lines[-1] += self.charbuffer
    564                         self.linebuffer = lines
    565                         self.charbuffer = None
    566                     else:
    567                         # only one remaining line, put it back into charbuffer
    568                         self.charbuffer = lines[0] + self.charbuffer
    569                     if not keepends:
    570                         line = line.splitlines(False)[0]
    571                     break
    572                 line0withend = lines[0]
    573                 line0withoutend = lines[0].splitlines(False)[0]
    574                 if line0withend != line0withoutend: # We really have a line end
    575                     # Put the rest back together and keep it until the next call
    576                     self.charbuffer = "".join(lines[1:]) + self.charbuffer
    577                     if keepends:
    578                         line = line0withend
    579                     else:
    580                         line = line0withoutend
    581                     break
    582             # we didn't get anything or this was our only try
    583             if not data or size is not None:
    584                 if line and not keepends:
    585                     line = line.splitlines(False)[0]
    586                 break
    587             if readsize<8000:
    588                 readsize *= 2
    589         return line
    590 
    591     def readlines(self, sizehint=None, keepends=True):
    592 
    593         """ Read all lines available on the input stream
    594             and return them as list of lines.
    595 
    596             Line breaks are implemented using the codec's decoder
    597             method and are included in the list entries.
    598 
    599             sizehint, if given, is ignored since there is no efficient
    600             way to finding the true end-of-line.
    601 
    602         """
    603         data = self.read()
    604         return data.splitlines(keepends)
    605 
    606     def reset(self):
    607 
    608         """ Resets the codec buffers used for keeping state.
    609 
    610             Note that no stream repositioning should take place.
    611             This method is primarily intended to be able to recover
    612             from decoding errors.
    613 
    614         """
    615         self.bytebuffer = ""
    616         self.charbuffer = u""
    617         self.linebuffer = None
    618 
    619     def seek(self, offset, whence=0):
    620         """ Set the input stream's current position.
    621 
    622             Resets the codec buffers used for keeping state.
    623         """
    624         self.stream.seek(offset, whence)
    625         self.reset()
    626 
    627     def next(self):
    628 
    629         """ Return the next decoded line from the input stream."""
    630         line = self.readline()
    631         if line:
    632             return line
    633         raise StopIteration
    634 
    635     def __iter__(self):
    636         return self
    637 
    638     def __getattr__(self, name,
    639                     getattr=getattr):
    640 
    641         """ Inherit all other methods from the underlying stream.
    642         """
    643         return getattr(self.stream, name)
    644 
    645     def __enter__(self):
    646         return self
    647 
    648     def __exit__(self, type, value, tb):
    649         self.stream.close()
    650 
    651 ###
    652 
    653 class StreamReaderWriter:
    654 
    655     """ StreamReaderWriter instances allow wrapping streams which
    656         work in both read and write modes.
    657 
    658         The design is such that one can use the factory functions
    659         returned by the codec.lookup() function to construct the
    660         instance.
    661 
    662     """
    663     # Optional attributes set by the file wrappers below
    664     encoding = 'unknown'
    665 
    666     def __init__(self, stream, Reader, Writer, errors='strict'):
    667 
    668         """ Creates a StreamReaderWriter instance.
    669 
    670             stream must be a Stream-like object.
    671 
    672             Reader, Writer must be factory functions or classes
    673             providing the StreamReader, StreamWriter interface resp.
    674 
    675             Error handling is done in the same way as defined for the
    676             StreamWriter/Readers.
    677 
    678         """
    679         self.stream = stream
    680         self.reader = Reader(stream, errors)
    681         self.writer = Writer(stream, errors)
    682         self.errors = errors
    683 
    684     def read(self, size=-1):
    685 
    686         return self.reader.read(size)
    687 
    688     def readline(self, size=None):
    689 
    690         return self.reader.readline(size)
    691 
    692     def readlines(self, sizehint=None):
    693 
    694         return self.reader.readlines(sizehint)
    695 
    696     def next(self):
    697 
    698         """ Return the next decoded line from the input stream."""
    699         return self.reader.next()
    700 
    701     def __iter__(self):
    702         return self
    703 
    704     def write(self, data):
    705 
    706         return self.writer.write(data)
    707 
    708     def writelines(self, list):
    709 
    710         return self.writer.writelines(list)
    711 
    712     def reset(self):
    713 
    714         self.reader.reset()
    715         self.writer.reset()
    716 
    717     def seek(self, offset, whence=0):
    718         self.stream.seek(offset, whence)
    719         self.reader.reset()
    720         if whence == 0 and offset == 0:
    721             self.writer.reset()
    722 
    723     def __getattr__(self, name,
    724                     getattr=getattr):
    725 
    726         """ Inherit all other methods from the underlying stream.
    727         """
    728         return getattr(self.stream, name)
    729 
    730     # these are needed to make "with codecs.open(...)" work properly
    731 
    732     def __enter__(self):
    733         return self
    734 
    735     def __exit__(self, type, value, tb):
    736         self.stream.close()
    737 
    738 ###
    739 
    740 class StreamRecoder:
    741 
    742     """ StreamRecoder instances provide a frontend - backend
    743         view of encoding data.
    744 
    745         They use the complete set of APIs returned by the
    746         codecs.lookup() function to implement their task.
    747 
    748         Data written to the stream is first decoded into an
    749         intermediate format (which is dependent on the given codec
    750         combination) and then written to the stream using an instance
    751         of the provided Writer class.
    752 
    753         In the other direction, data is read from the stream using a
    754         Reader instance and then return encoded data to the caller.
    755 
    756     """
    757     # Optional attributes set by the file wrappers below
    758     data_encoding = 'unknown'
    759     file_encoding = 'unknown'
    760 
    761     def __init__(self, stream, encode, decode, Reader, Writer,
    762                  errors='strict'):
    763 
    764         """ Creates a StreamRecoder instance which implements a two-way
    765             conversion: encode and decode work on the frontend (the
    766             input to .read() and output of .write()) while
    767             Reader and Writer work on the backend (reading and
    768             writing to the stream).
    769 
    770             You can use these objects to do transparent direct
    771             recodings from e.g. latin-1 to utf-8 and back.
    772 
    773             stream must be a file-like object.
    774 
    775             encode, decode must adhere to the Codec interface, Reader,
    776             Writer must be factory functions or classes providing the
    777             StreamReader, StreamWriter interface resp.
    778 
    779             encode and decode are needed for the frontend translation,
    780             Reader and Writer for the backend translation. Unicode is
    781             used as intermediate encoding.
    782 
    783             Error handling is done in the same way as defined for the
    784             StreamWriter/Readers.
    785 
    786         """
    787         self.stream = stream
    788         self.encode = encode
    789         self.decode = decode
    790         self.reader = Reader(stream, errors)
    791         self.writer = Writer(stream, errors)
    792         self.errors = errors
    793 
    794     def read(self, size=-1):
    795 
    796         data = self.reader.read(size)
    797         data, bytesencoded = self.encode(data, self.errors)
    798         return data
    799 
    800     def readline(self, size=None):
    801 
    802         if size is None:
    803             data = self.reader.readline()
    804         else:
    805             data = self.reader.readline(size)
    806         data, bytesencoded = self.encode(data, self.errors)
    807         return data
    808 
    809     def readlines(self, sizehint=None):
    810 
    811         data = self.reader.read()
    812         data, bytesencoded = self.encode(data, self.errors)
    813         return data.splitlines(1)
    814 
    815     def next(self):
    816 
    817         """ Return the next decoded line from the input stream."""
    818         data = self.reader.next()
    819         data, bytesencoded = self.encode(data, self.errors)
    820         return data
    821 
    822     def __iter__(self):
    823         return self
    824 
    825     def write(self, data):
    826 
    827         data, bytesdecoded = self.decode(data, self.errors)
    828         return self.writer.write(data)
    829 
    830     def writelines(self, list):
    831 
    832         data = ''.join(list)
    833         data, bytesdecoded = self.decode(data, self.errors)
    834         return self.writer.write(data)
    835 
    836     def reset(self):
    837 
    838         self.reader.reset()
    839         self.writer.reset()
    840 
    841     def __getattr__(self, name,
    842                     getattr=getattr):
    843 
    844         """ Inherit all other methods from the underlying stream.
    845         """
    846         return getattr(self.stream, name)
    847 
    848     def __enter__(self):
    849         return self
    850 
    851     def __exit__(self, type, value, tb):
    852         self.stream.close()
    853 
    854 ### Shortcuts
    855 
    856 def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
    857 
    858     """ Open an encoded file using the given mode and return
    859         a wrapped version providing transparent encoding/decoding.
    860 
    861         Note: The wrapped version will only accept the object format
    862         defined by the codecs, i.e. Unicode objects for most builtin
    863         codecs. Output is also codec dependent and will usually be
    864         Unicode as well.
    865 
    866         Files are always opened in binary mode, even if no binary mode
    867         was specified. This is done to avoid data loss due to encodings
    868         using 8-bit values. The default file mode is 'rb' meaning to
    869         open the file in binary read mode.
    870 
    871         encoding specifies the encoding which is to be used for the
    872         file.
    873 
    874         errors may be given to define the error handling. It defaults
    875         to 'strict' which causes ValueErrors to be raised in case an
    876         encoding error occurs.
    877 
    878         buffering has the same meaning as for the builtin open() API.
    879         It defaults to line buffered.
    880 
    881         The returned wrapped file object provides an extra attribute
    882         .encoding which allows querying the used encoding. This
    883         attribute is only available if an encoding was specified as
    884         parameter.
    885 
    886     """
    887     if encoding is not None:
    888         if 'U' in mode:
    889             # No automatic conversion of '\n' is done on reading and writing
    890             mode = mode.strip().replace('U', '')
    891             if mode[:1] not in set('rwa'):
    892                 mode = 'r' + mode
    893         if 'b' not in mode:
    894             # Force opening of the file in binary mode
    895             mode = mode + 'b'
    896     file = __builtin__.open(filename, mode, buffering)
    897     if encoding is None:
    898         return file
    899     info = lookup(encoding)
    900     srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    901     # Add attributes to simplify introspection
    902     srw.encoding = encoding
    903     return srw
    904 
    905 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
    906 
    907     """ Return a wrapped version of file which provides transparent
    908         encoding translation.
    909 
    910         Strings written to the wrapped file are interpreted according
    911         to the given data_encoding and then written to the original
    912         file as string using file_encoding. The intermediate encoding
    913         will usually be Unicode but depends on the specified codecs.
    914 
    915         Strings are read from the file using file_encoding and then
    916         passed back to the caller as string using data_encoding.
    917 
    918         If file_encoding is not given, it defaults to data_encoding.
    919 
    920         errors may be given to define the error handling. It defaults
    921         to 'strict' which causes ValueErrors to be raised in case an
    922         encoding error occurs.
    923 
    924         The returned wrapped file object provides two extra attributes
    925         .data_encoding and .file_encoding which reflect the given
    926         parameters of the same name. The attributes can be used for
    927         introspection by Python programs.
    928 
    929     """
    930     if file_encoding is None:
    931         file_encoding = data_encoding
    932     data_info = lookup(data_encoding)
    933     file_info = lookup(file_encoding)
    934     sr = StreamRecoder(file, data_info.encode, data_info.decode,
    935                        file_info.streamreader, file_info.streamwriter, errors)
    936     # Add attributes to simplify introspection
    937     sr.data_encoding = data_encoding
    938     sr.file_encoding = file_encoding
    939     return sr
    940 
    941 ### Helpers for codec lookup
    942 
    943 def getencoder(encoding):
    944 
    945     """ Lookup up the codec for the given encoding and return
    946         its encoder function.
    947 
    948         Raises a LookupError in case the encoding cannot be found.
    949 
    950     """
    951     return lookup(encoding).encode
    952 
    953 def getdecoder(encoding):
    954 
    955     """ Lookup up the codec for the given encoding and return
    956         its decoder function.
    957 
    958         Raises a LookupError in case the encoding cannot be found.
    959 
    960     """
    961     return lookup(encoding).decode
    962 
    963 def getincrementalencoder(encoding):
    964 
    965     """ Lookup up the codec for the given encoding and return
    966         its IncrementalEncoder class or factory function.
    967 
    968         Raises a LookupError in case the encoding cannot be found
    969         or the codecs doesn't provide an incremental encoder.
    970 
    971     """
    972     encoder = lookup(encoding).incrementalencoder
    973     if encoder is None:
    974         raise LookupError(encoding)
    975     return encoder
    976 
    977 def getincrementaldecoder(encoding):
    978 
    979     """ Lookup up the codec for the given encoding and return
    980         its IncrementalDecoder class or factory function.
    981 
    982         Raises a LookupError in case the encoding cannot be found
    983         or the codecs doesn't provide an incremental decoder.
    984 
    985     """
    986     decoder = lookup(encoding).incrementaldecoder
    987     if decoder is None:
    988         raise LookupError(encoding)
    989     return decoder
    990 
    991 def getreader(encoding):
    992 
    993     """ Lookup up the codec for the given encoding and return
    994         its StreamReader class or factory function.
    995 
    996         Raises a LookupError in case the encoding cannot be found.
    997 
    998     """
    999     return lookup(encoding).streamreader
   1000 
   1001 def getwriter(encoding):
   1002 
   1003     """ Lookup up the codec for the given encoding and return
   1004         its StreamWriter class or factory function.
   1005 
   1006         Raises a LookupError in case the encoding cannot be found.
   1007 
   1008     """
   1009     return lookup(encoding).streamwriter
   1010 
   1011 def iterencode(iterator, encoding, errors='strict', **kwargs):
   1012     """
   1013     Encoding iterator.
   1014 
   1015     Encodes the input strings from the iterator using an IncrementalEncoder.
   1016 
   1017     errors and kwargs are passed through to the IncrementalEncoder
   1018     constructor.
   1019     """
   1020     encoder = getincrementalencoder(encoding)(errors, **kwargs)
   1021     for input in iterator:
   1022         output = encoder.encode(input)
   1023         if output:
   1024             yield output
   1025     output = encoder.encode("", True)
   1026     if output:
   1027         yield output
   1028 
   1029 def iterdecode(iterator, encoding, errors='strict', **kwargs):
   1030     """
   1031     Decoding iterator.
   1032 
   1033     Decodes the input strings from the iterator using an IncrementalDecoder.
   1034 
   1035     errors and kwargs are passed through to the IncrementalDecoder
   1036     constructor.
   1037     """
   1038     decoder = getincrementaldecoder(encoding)(errors, **kwargs)
   1039     for input in iterator:
   1040         output = decoder.decode(input)
   1041         if output:
   1042             yield output
   1043     output = decoder.decode("", True)
   1044     if output:
   1045         yield output
   1046 
   1047 ### Helpers for charmap-based codecs
   1048 
   1049 def make_identity_dict(rng):
   1050 
   1051     """ make_identity_dict(rng) -> dict
   1052 
   1053         Return a dictionary where elements of the rng sequence are
   1054         mapped to themselves.
   1055 
   1056     """
   1057     res = {}
   1058     for i in rng:
   1059         res[i]=i
   1060     return res
   1061 
   1062 def make_encoding_map(decoding_map):
   1063 
   1064     """ Creates an encoding map from a decoding map.
   1065 
   1066         If a target mapping in the decoding map occurs multiple
   1067         times, then that target is mapped to None (undefined mapping),
   1068         causing an exception when encountered by the charmap codec
   1069         during translation.
   1070 
   1071         One example where this happens is cp875.py which decodes
   1072         multiple character to \\u001a.
   1073 
   1074     """
   1075     m = {}
   1076     for k,v in decoding_map.items():
   1077         if not v in m:
   1078             m[v] = k
   1079         else:
   1080             m[v] = None
   1081     return m
   1082 
   1083 ### error handlers
   1084 
try:
    # Bind the error handlers registered under the standard names to
    # module-level names (all five are listed in __all__).
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing.
    # A single failed lookup sets all five names to None, so the
    # names always exist.
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
   1098 
# Tell modulefinder that using codecs probably needs the encodings
# package.  _false is 0, so the import below is never executed; it
# is present only so that the import statement appears in this
# module.
_false = 0
if _false:
    import encodings
   1104 
   1105 ### Tests
   1106 
if __name__ == '__main__':

    # Only runs when this module is executed as a script.

    # Make stdout translate Latin-1 output into UTF-8 output:
    # data written is taken as Latin-1 (data_encoding) and reaches
    # the real stdout as UTF-8 (file_encoding).
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input:
    # the real stdin is read as Latin-1 (file_encoding) and handed
    # to the caller as UTF-8 (data_encoding).
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
   1114