Home | History | Annotate | Download | only in python2.7
      1 """ codecs -- Python Codec Registry, API and helpers.
      2 
      3 
Written by Marc-Andre Lemburg (mal@lemburg.com).
      5 
      6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
      7 
      8 """#"
      9 
     10 import __builtin__, sys
     11 
     12 ### Registry and builtin stateless codec functions
     13 
     14 try:
     15     from _codecs import *
     16 except ImportError, why:
     17     raise SystemError('Failed to load the builtin codecs: %s' % why)
     18 
     19 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
     20            "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
     21            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
     22            "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
     23            "strict_errors", "ignore_errors", "replace_errors",
     24            "xmlcharrefreplace_errors",
     25            "register_error", "lookup_error"]
     26 
     27 ### Constants
     28 
     29 #
     30 # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
     31 # and its possible byte string values
     32 # for UTF8/UTF16/UTF32 output and little/big endian machines
     33 #
     34 
     35 # UTF-8
     36 BOM_UTF8 = '\xef\xbb\xbf'
     37 
     38 # UTF-16, little endian
     39 BOM_LE = BOM_UTF16_LE = '\xff\xfe'
     40 
     41 # UTF-16, big endian
     42 BOM_BE = BOM_UTF16_BE = '\xfe\xff'
     43 
     44 # UTF-32, little endian
     45 BOM_UTF32_LE = '\xff\xfe\x00\x00'
     46 
     47 # UTF-32, big endian
     48 BOM_UTF32_BE = '\x00\x00\xfe\xff'
     49 
     50 if sys.byteorder == 'little':
     51 
     52     # UTF-16, native endianness
     53     BOM = BOM_UTF16 = BOM_UTF16_LE
     54 
     55     # UTF-32, native endianness
     56     BOM_UTF32 = BOM_UTF32_LE
     57 
     58 else:
     59 
     60     # UTF-16, native endianness
     61     BOM = BOM_UTF16 = BOM_UTF16_BE
     62 
     63     # UTF-32, native endianness
     64     BOM_UTF32 = BOM_UTF32_BE
     65 
     66 # Old broken names (don't use in new code)
     67 BOM32_LE = BOM_UTF16_LE
     68 BOM32_BE = BOM_UTF16_BE
     69 BOM64_LE = BOM_UTF32_LE
     70 BOM64_BE = BOM_UTF32_BE
     71 
     72 
     73 ### Codec base classes (defining the API)
     74 
     75 class CodecInfo(tuple):
     76 
     77     def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
     78         incrementalencoder=None, incrementaldecoder=None, name=None):
     79         self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
     80         self.name = name
     81         self.encode = encode
     82         self.decode = decode
     83         self.incrementalencoder = incrementalencoder
     84         self.incrementaldecoder = incrementaldecoder
     85         self.streamwriter = streamwriter
     86         self.streamreader = streamreader
     87         return self
     88 
     89     def __repr__(self):
     90         return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
     91 
     92 class Codec:
     93 
     94     """ Defines the interface for stateless encoders/decoders.
     95 
     96         The .encode()/.decode() methods may use different error
     97         handling schemes by providing the errors argument. These
     98         string values are predefined:
     99 
    100          'strict' - raise a ValueError error (or a subclass)
    101          'ignore' - ignore the character and continue with the next
    102          'replace' - replace with a suitable replacement character;
    103                     Python will use the official U+FFFD REPLACEMENT
    104                     CHARACTER for the builtin Unicode codecs on
    105                     decoding and '?' on encoding.
    106          'xmlcharrefreplace' - Replace with the appropriate XML
    107                                character reference (only for encoding).
    108          'backslashreplace'  - Replace with backslashed escape sequences
    109                                (only for encoding).
    110 
    111         The set of allowed values can be extended via register_error.
    112 
    113     """
    114     def encode(self, input, errors='strict'):
    115 
    116         """ Encodes the object input and returns a tuple (output
    117             object, length consumed).
    118 
    119             errors defines the error handling to apply. It defaults to
    120             'strict' handling.
    121 
    122             The method may not store state in the Codec instance. Use
    123             StreamCodec for codecs which have to keep state in order to
    124             make encoding/decoding efficient.
    125 
    126             The encoder must be able to handle zero length input and
    127             return an empty object of the output object type in this
    128             situation.
    129 
    130         """
    131         raise NotImplementedError
    132 
    133     def decode(self, input, errors='strict'):
    134 
    135         """ Decodes the object input and returns a tuple (output
    136             object, length consumed).
    137 
    138             input must be an object which provides the bf_getreadbuf
    139             buffer slot. Python strings, buffer objects and memory
    140             mapped files are examples of objects providing this slot.
    141 
    142             errors defines the error handling to apply. It defaults to
    143             'strict' handling.
    144 
    145             The method may not store state in the Codec instance. Use
    146             StreamCodec for codecs which have to keep state in order to
    147             make encoding/decoding efficient.
    148 
    149             The decoder must be able to handle zero length input and
    150             return an empty object of the output object type in this
    151             situation.
    152 
    153         """
    154         raise NotImplementedError
    155 
    156 class IncrementalEncoder(object):
    157     """
    158     An IncrementalEncoder encodes an input in multiple steps. The input can be
    159     passed piece by piece to the encode() method. The IncrementalEncoder remembers
    160     the state of the Encoding process between calls to encode().
    161     """
    162     def __init__(self, errors='strict'):
    163         """
    164         Creates an IncrementalEncoder instance.
    165 
    166         The IncrementalEncoder may use different error handling schemes by
    167         providing the errors keyword argument. See the module docstring
    168         for a list of possible values.
    169         """
    170         self.errors = errors
    171         self.buffer = ""
    172 
    173     def encode(self, input, final=False):
    174         """
    175         Encodes input and returns the resulting object.
    176         """
    177         raise NotImplementedError
    178 
    179     def reset(self):
    180         """
    181         Resets the encoder to the initial state.
    182         """
    183 
    184     def getstate(self):
    185         """
    186         Return the current state of the encoder.
    187         """
    188         return 0
    189 
    190     def setstate(self, state):
    191         """
    192         Set the current state of the encoder. state must have been
    193         returned by getstate().
    194         """
    195 
    196 class BufferedIncrementalEncoder(IncrementalEncoder):
    197     """
    198     This subclass of IncrementalEncoder can be used as the baseclass for an
    199     incremental encoder if the encoder must keep some of the output in a
    200     buffer between calls to encode().
    201     """
    202     def __init__(self, errors='strict'):
    203         IncrementalEncoder.__init__(self, errors)
    204         self.buffer = "" # unencoded input that is kept between calls to encode()
    205 
    206     def _buffer_encode(self, input, errors, final):
    207         # Overwrite this method in subclasses: It must encode input
    208         # and return an (output, length consumed) tuple
    209         raise NotImplementedError
    210 
    211     def encode(self, input, final=False):
    212         # encode input (taking the buffer into account)
    213         data = self.buffer + input
    214         (result, consumed) = self._buffer_encode(data, self.errors, final)
    215         # keep unencoded input until the next call
    216         self.buffer = data[consumed:]
    217         return result
    218 
    219     def reset(self):
    220         IncrementalEncoder.reset(self)
    221         self.buffer = ""
    222 
    223     def getstate(self):
    224         return self.buffer or 0
    225 
    226     def setstate(self, state):
    227         self.buffer = state or ""
    228 
    229 class IncrementalDecoder(object):
    230     """
    231     An IncrementalDecoder decodes an input in multiple steps. The input can be
    232     passed piece by piece to the decode() method. The IncrementalDecoder
    233     remembers the state of the decoding process between calls to decode().
    234     """
    235     def __init__(self, errors='strict'):
    236         """
    237         Creates a IncrementalDecoder instance.
    238 
    239         The IncrementalDecoder may use different error handling schemes by
    240         providing the errors keyword argument. See the module docstring
    241         for a list of possible values.
    242         """
    243         self.errors = errors
    244 
    245     def decode(self, input, final=False):
    246         """
    247         Decodes input and returns the resulting object.
    248         """
    249         raise NotImplementedError
    250 
    251     def reset(self):
    252         """
    253         Resets the decoder to the initial state.
    254         """
    255 
    256     def getstate(self):
    257         """
    258         Return the current state of the decoder.
    259 
    260         This must be a (buffered_input, additional_state_info) tuple.
    261         buffered_input must be a bytes object containing bytes that
    262         were passed to decode() that have not yet been converted.
    263         additional_state_info must be a non-negative integer
    264         representing the state of the decoder WITHOUT yet having
    265         processed the contents of buffered_input.  In the initial state
    266         and after reset(), getstate() must return (b"", 0).
    267         """
    268         return (b"", 0)
    269 
    270     def setstate(self, state):
    271         """
    272         Set the current state of the decoder.
    273 
    274         state must have been returned by getstate().  The effect of
    275         setstate((b"", 0)) must be equivalent to reset().
    276         """
    277 
    278 class BufferedIncrementalDecoder(IncrementalDecoder):
    279     """
    280     This subclass of IncrementalDecoder can be used as the baseclass for an
    281     incremental decoder if the decoder must be able to handle incomplete byte
    282     sequences.
    283     """
    284     def __init__(self, errors='strict'):
    285         IncrementalDecoder.__init__(self, errors)
    286         self.buffer = "" # undecoded input that is kept between calls to decode()
    287 
    288     def _buffer_decode(self, input, errors, final):
    289         # Overwrite this method in subclasses: It must decode input
    290         # and return an (output, length consumed) tuple
    291         raise NotImplementedError
    292 
    293     def decode(self, input, final=False):
    294         # decode input (taking the buffer into account)
    295         data = self.buffer + input
    296         (result, consumed) = self._buffer_decode(data, self.errors, final)
    297         # keep undecoded input until the next call
    298         self.buffer = data[consumed:]
    299         return result
    300 
    301     def reset(self):
    302         IncrementalDecoder.reset(self)
    303         self.buffer = ""
    304 
    305     def getstate(self):
    306         # additional state info is always 0
    307         return (self.buffer, 0)
    308 
    309     def setstate(self, state):
    310         # ignore additional state info
    311         self.buffer = state[0]
    312 
    313 #
    314 # The StreamWriter and StreamReader class provide generic working
    315 # interfaces which can be used to implement new encoding submodules
    316 # very easily. See encodings/utf_8.py for an example on how this is
    317 # done.
    318 #
    319 
    320 class StreamWriter(Codec):
    321 
    322     def __init__(self, stream, errors='strict'):
    323 
    324         """ Creates a StreamWriter instance.
    325 
    326             stream must be a file-like object open for writing
    327             (binary) data.
    328 
    329             The StreamWriter may use different error handling
    330             schemes by providing the errors keyword argument. These
    331             parameters are predefined:
    332 
    333              'strict' - raise a ValueError (or a subclass)
    334              'ignore' - ignore the character and continue with the next
    335              'replace'- replace with a suitable replacement character
    336              'xmlcharrefreplace' - Replace with the appropriate XML
    337                                    character reference.
    338              'backslashreplace'  - Replace with backslashed escape
    339                                    sequences (only for encoding).
    340 
    341             The set of allowed parameter values can be extended via
    342             register_error.
    343         """
    344         self.stream = stream
    345         self.errors = errors
    346 
    347     def write(self, object):
    348 
    349         """ Writes the object's contents encoded to self.stream.
    350         """
    351         data, consumed = self.encode(object, self.errors)
    352         self.stream.write(data)
    353 
    354     def writelines(self, list):
    355 
    356         """ Writes the concatenated list of strings to the stream
    357             using .write().
    358         """
    359         self.write(''.join(list))
    360 
    361     def reset(self):
    362 
    363         """ Flushes and resets the codec buffers used for keeping state.
    364 
    365             Calling this method should ensure that the data on the
    366             output is put into a clean state, that allows appending
    367             of new fresh data without having to rescan the whole
    368             stream to recover state.
    369 
    370         """
    371         pass
    372 
    373     def seek(self, offset, whence=0):
    374         self.stream.seek(offset, whence)
    375         if whence == 0 and offset == 0:
    376             self.reset()
    377 
    378     def __getattr__(self, name,
    379                     getattr=getattr):
    380 
    381         """ Inherit all other methods from the underlying stream.
    382         """
    383         return getattr(self.stream, name)
    384 
    385     def __enter__(self):
    386         return self
    387 
    388     def __exit__(self, type, value, tb):
    389         self.stream.close()
    390 
    391 ###
    392 
    393 class StreamReader(Codec):
    394 
    395     def __init__(self, stream, errors='strict'):
    396 
    397         """ Creates a StreamReader instance.
    398 
    399             stream must be a file-like object open for reading
    400             (binary) data.
    401 
    402             The StreamReader may use different error handling
    403             schemes by providing the errors keyword argument. These
    404             parameters are predefined:
    405 
    406              'strict' - raise a ValueError (or a subclass)
    407              'ignore' - ignore the character and continue with the next
    408              'replace'- replace with a suitable replacement character;
    409 
    410             The set of allowed parameter values can be extended via
    411             register_error.
    412         """
    413         self.stream = stream
    414         self.errors = errors
    415         self.bytebuffer = ""
    416         # For str->str decoding this will stay a str
    417         # For str->unicode decoding the first read will promote it to unicode
    418         self.charbuffer = ""
    419         self.linebuffer = None
    420 
    421     def decode(self, input, errors='strict'):
    422         raise NotImplementedError
    423 
    424     def read(self, size=-1, chars=-1, firstline=False):
    425 
    426         """ Decodes data from the stream self.stream and returns the
    427             resulting object.
    428 
    429             chars indicates the number of characters to read from the
    430             stream. read() will never return more than chars
    431             characters, but it might return less, if there are not enough
    432             characters available.
    433 
    434             size indicates the approximate maximum number of bytes to
    435             read from the stream for decoding purposes. The decoder
    436             can modify this setting as appropriate. The default value
    437             -1 indicates to read and decode as much as possible.  size
    438             is intended to prevent having to decode huge files in one
    439             step.
    440 
    441             If firstline is true, and a UnicodeDecodeError happens
    442             after the first line terminator in the input only the first line
    443             will be returned, the rest of the input will be kept until the
    444             next call to read().
    445 
    446             The method should use a greedy read strategy meaning that
    447             it should read as much data as is allowed within the
    448             definition of the encoding and the given size, e.g.  if
    449             optional encoding endings or state markers are available
    450             on the stream, these should be read too.
    451         """
    452         # If we have lines cached, first merge them back into characters
    453         if self.linebuffer:
    454             self.charbuffer = "".join(self.linebuffer)
    455             self.linebuffer = None
    456 
    457         # read until we get the required number of characters (if available)
    458         while True:
    459             # can the request can be satisfied from the character buffer?
    460             if chars < 0:
    461                 if size < 0:
    462                     if self.charbuffer:
    463                         break
    464                 elif len(self.charbuffer) >= size:
    465                     break
    466             else:
    467                 if len(self.charbuffer) >= chars:
    468                     break
    469             # we need more data
    470             if size < 0:
    471                 newdata = self.stream.read()
    472             else:
    473                 newdata = self.stream.read(size)
    474             # decode bytes (those remaining from the last call included)
    475             data = self.bytebuffer + newdata
    476             try:
    477                 newchars, decodedbytes = self.decode(data, self.errors)
    478             except UnicodeDecodeError, exc:
    479                 if firstline:
    480                     newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
    481                     lines = newchars.splitlines(True)
    482                     if len(lines)<=1:
    483                         raise
    484                 else:
    485                     raise
    486             # keep undecoded bytes until the next call
    487             self.bytebuffer = data[decodedbytes:]
    488             # put new characters in the character buffer
    489             self.charbuffer += newchars
    490             # there was no data available
    491             if not newdata:
    492                 break
    493         if chars < 0:
    494             # Return everything we've got
    495             result = self.charbuffer
    496             self.charbuffer = ""
    497         else:
    498             # Return the first chars characters
    499             result = self.charbuffer[:chars]
    500             self.charbuffer = self.charbuffer[chars:]
    501         return result
    502 
    503     def readline(self, size=None, keepends=True):
    504 
    505         """ Read one line from the input stream and return the
    506             decoded data.
    507 
    508             size, if given, is passed as size argument to the
    509             read() method.
    510 
    511         """
    512         # If we have lines cached from an earlier read, return
    513         # them unconditionally
    514         if self.linebuffer:
    515             line = self.linebuffer[0]
    516             del self.linebuffer[0]
    517             if len(self.linebuffer) == 1:
    518                 # revert to charbuffer mode; we might need more data
    519                 # next time
    520                 self.charbuffer = self.linebuffer[0]
    521                 self.linebuffer = None
    522             if not keepends:
    523                 line = line.splitlines(False)[0]
    524             return line
    525 
    526         readsize = size or 72
    527         line = ""
    528         # If size is given, we call read() only once
    529         while True:
    530             data = self.read(readsize, firstline=True)
    531             if data:
    532                 # If we're at a "\r" read one extra character (which might
    533                 # be a "\n") to get a proper line ending. If the stream is
    534                 # temporarily exhausted we return the wrong line ending.
    535                 if data.endswith("\r"):
    536                     data += self.read(size=1, chars=1)
    537 
    538             line += data
    539             lines = line.splitlines(True)
    540             if lines:
    541                 if len(lines) > 1:
    542                     # More than one line result; the first line is a full line
    543                     # to return
    544                     line = lines[0]
    545                     del lines[0]
    546                     if len(lines) > 1:
    547                         # cache the remaining lines
    548                         lines[-1] += self.charbuffer
    549                         self.linebuffer = lines
    550                         self.charbuffer = None
    551                     else:
    552                         # only one remaining line, put it back into charbuffer
    553                         self.charbuffer = lines[0] + self.charbuffer
    554                     if not keepends:
    555                         line = line.splitlines(False)[0]
    556                     break
    557                 line0withend = lines[0]
    558                 line0withoutend = lines[0].splitlines(False)[0]
    559                 if line0withend != line0withoutend: # We really have a line end
    560                     # Put the rest back together and keep it until the next call
    561                     self.charbuffer = "".join(lines[1:]) + self.charbuffer
    562                     if keepends:
    563                         line = line0withend
    564                     else:
    565                         line = line0withoutend
    566                     break
    567             # we didn't get anything or this was our only try
    568             if not data or size is not None:
    569                 if line and not keepends:
    570                     line = line.splitlines(False)[0]
    571                 break
    572             if readsize<8000:
    573                 readsize *= 2
    574         return line
    575 
    576     def readlines(self, sizehint=None, keepends=True):
    577 
    578         """ Read all lines available on the input stream
    579             and return them as list of lines.
    580 
    581             Line breaks are implemented using the codec's decoder
    582             method and are included in the list entries.
    583 
    584             sizehint, if given, is ignored since there is no efficient
    585             way to finding the true end-of-line.
    586 
    587         """
    588         data = self.read()
    589         return data.splitlines(keepends)
    590 
    591     def reset(self):
    592 
    593         """ Resets the codec buffers used for keeping state.
    594 
    595             Note that no stream repositioning should take place.
    596             This method is primarily intended to be able to recover
    597             from decoding errors.
    598 
    599         """
    600         self.bytebuffer = ""
    601         self.charbuffer = u""
    602         self.linebuffer = None
    603 
    604     def seek(self, offset, whence=0):
    605         """ Set the input stream's current position.
    606 
    607             Resets the codec buffers used for keeping state.
    608         """
    609         self.stream.seek(offset, whence)
    610         self.reset()
    611 
    612     def next(self):
    613 
    614         """ Return the next decoded line from the input stream."""
    615         line = self.readline()
    616         if line:
    617             return line
    618         raise StopIteration
    619 
    620     def __iter__(self):
    621         return self
    622 
    623     def __getattr__(self, name,
    624                     getattr=getattr):
    625 
    626         """ Inherit all other methods from the underlying stream.
    627         """
    628         return getattr(self.stream, name)
    629 
    630     def __enter__(self):
    631         return self
    632 
    633     def __exit__(self, type, value, tb):
    634         self.stream.close()
    635 
    636 ###
    637 
    638 class StreamReaderWriter:
    639 
    640     """ StreamReaderWriter instances allow wrapping streams which
    641         work in both read and write modes.
    642 
    643         The design is such that one can use the factory functions
    644         returned by the codec.lookup() function to construct the
    645         instance.
    646 
    647     """
    648     # Optional attributes set by the file wrappers below
    649     encoding = 'unknown'
    650 
    651     def __init__(self, stream, Reader, Writer, errors='strict'):
    652 
    653         """ Creates a StreamReaderWriter instance.
    654 
    655             stream must be a Stream-like object.
    656 
    657             Reader, Writer must be factory functions or classes
    658             providing the StreamReader, StreamWriter interface resp.
    659 
    660             Error handling is done in the same way as defined for the
    661             StreamWriter/Readers.
    662 
    663         """
    664         self.stream = stream
    665         self.reader = Reader(stream, errors)
    666         self.writer = Writer(stream, errors)
    667         self.errors = errors
    668 
    669     def read(self, size=-1):
    670 
    671         return self.reader.read(size)
    672 
    673     def readline(self, size=None):
    674 
    675         return self.reader.readline(size)
    676 
    677     def readlines(self, sizehint=None):
    678 
    679         return self.reader.readlines(sizehint)
    680 
    681     def next(self):
    682 
    683         """ Return the next decoded line from the input stream."""
    684         return self.reader.next()
    685 
    686     def __iter__(self):
    687         return self
    688 
    689     def write(self, data):
    690 
    691         return self.writer.write(data)
    692 
    693     def writelines(self, list):
    694 
    695         return self.writer.writelines(list)
    696 
    697     def reset(self):
    698 
    699         self.reader.reset()
    700         self.writer.reset()
    701 
    702     def seek(self, offset, whence=0):
    703         self.stream.seek(offset, whence)
    704         self.reader.reset()
    705         if whence == 0 and offset == 0:
    706             self.writer.reset()
    707 
    708     def __getattr__(self, name,
    709                     getattr=getattr):
    710 
    711         """ Inherit all other methods from the underlying stream.
    712         """
    713         return getattr(self.stream, name)
    714 
    715     # these are needed to make "with codecs.open(...)" work properly
    716 
    717     def __enter__(self):
    718         return self
    719 
    720     def __exit__(self, type, value, tb):
    721         self.stream.close()
    722 
    723 ###
    724 
    725 class StreamRecoder:
    726 
    727     """ StreamRecoder instances provide a frontend - backend
    728         view of encoding data.
    729 
    730         They use the complete set of APIs returned by the
    731         codecs.lookup() function to implement their task.
    732 
    733         Data written to the stream is first decoded into an
    734         intermediate format (which is dependent on the given codec
    735         combination) and then written to the stream using an instance
    736         of the provided Writer class.
    737 
    738         In the other direction, data is read from the stream using a
    739         Reader instance and then return encoded data to the caller.
    740 
    741     """
    742     # Optional attributes set by the file wrappers below
    743     data_encoding = 'unknown'
    744     file_encoding = 'unknown'
    745 
    746     def __init__(self, stream, encode, decode, Reader, Writer,
    747                  errors='strict'):
    748 
    749         """ Creates a StreamRecoder instance which implements a two-way
    750             conversion: encode and decode work on the frontend (the
    751             input to .read() and output of .write()) while
    752             Reader and Writer work on the backend (reading and
    753             writing to the stream).
    754 
    755             You can use these objects to do transparent direct
    756             recodings from e.g. latin-1 to utf-8 and back.
    757 
    758             stream must be a file-like object.
    759 
    760             encode, decode must adhere to the Codec interface, Reader,
    761             Writer must be factory functions or classes providing the
    762             StreamReader, StreamWriter interface resp.
    763 
    764             encode and decode are needed for the frontend translation,
    765             Reader and Writer for the backend translation. Unicode is
    766             used as intermediate encoding.
    767 
    768             Error handling is done in the same way as defined for the
    769             StreamWriter/Readers.
    770 
    771         """
    772         self.stream = stream
    773         self.encode = encode
    774         self.decode = decode
    775         self.reader = Reader(stream, errors)
    776         self.writer = Writer(stream, errors)
    777         self.errors = errors
    778 
    779     def read(self, size=-1):
    780 
    781         data = self.reader.read(size)
    782         data, bytesencoded = self.encode(data, self.errors)
    783         return data
    784 
    785     def readline(self, size=None):
    786 
    787         if size is None:
    788             data = self.reader.readline()
    789         else:
    790             data = self.reader.readline(size)
    791         data, bytesencoded = self.encode(data, self.errors)
    792         return data
    793 
    794     def readlines(self, sizehint=None):
    795 
    796         data = self.reader.read()
    797         data, bytesencoded = self.encode(data, self.errors)
    798         return data.splitlines(1)
    799 
    800     def next(self):
    801 
    802         """ Return the next decoded line from the input stream."""
    803         data = self.reader.next()
    804         data, bytesencoded = self.encode(data, self.errors)
    805         return data
    806 
    807     def __iter__(self):
    808         return self
    809 
    810     def write(self, data):
    811 
    812         data, bytesdecoded = self.decode(data, self.errors)
    813         return self.writer.write(data)
    814 
    815     def writelines(self, list):
    816 
    817         data = ''.join(list)
    818         data, bytesdecoded = self.decode(data, self.errors)
    819         return self.writer.write(data)
    820 
    821     def reset(self):
    822 
    823         self.reader.reset()
    824         self.writer.reset()
    825 
    826     def __getattr__(self, name,
    827                     getattr=getattr):
    828 
    829         """ Inherit all other methods from the underlying stream.
    830         """
    831         return getattr(self.stream, name)
    832 
    833     def __enter__(self):
    834         return self
    835 
    836     def __exit__(self, type, value, tb):
    837         self.stream.close()
    838 
    839 ### Shortcuts
    840 
    841 def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
    842 
    843     """ Open an encoded file using the given mode and return
    844         a wrapped version providing transparent encoding/decoding.
    845 
    846         Note: The wrapped version will only accept the object format
    847         defined by the codecs, i.e. Unicode objects for most builtin
    848         codecs. Output is also codec dependent and will usually be
    849         Unicode as well.
    850 
    851         Files are always opened in binary mode, even if no binary mode
    852         was specified. This is done to avoid data loss due to encodings
    853         using 8-bit values. The default file mode is 'rb' meaning to
    854         open the file in binary read mode.
    855 
    856         encoding specifies the encoding which is to be used for the
    857         file.
    858 
    859         errors may be given to define the error handling. It defaults
    860         to 'strict' which causes ValueErrors to be raised in case an
    861         encoding error occurs.
    862 
    863         buffering has the same meaning as for the builtin open() API.
    864         It defaults to line buffered.
    865 
    866         The returned wrapped file object provides an extra attribute
    867         .encoding which allows querying the used encoding. This
    868         attribute is only available if an encoding was specified as
    869         parameter.
    870 
    871     """
    872     if encoding is not None:
    873         if 'U' in mode:
    874             # No automatic conversion of '\n' is done on reading and writing
    875             mode = mode.strip().replace('U', '')
    876             if mode[:1] not in set('rwa'):
    877                 mode = 'r' + mode
    878         if 'b' not in mode:
    879             # Force opening of the file in binary mode
    880             mode = mode + 'b'
    881     file = __builtin__.open(filename, mode, buffering)
    882     if encoding is None:
    883         return file
    884     info = lookup(encoding)
    885     srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    886     # Add attributes to simplify introspection
    887     srw.encoding = encoding
    888     return srw
    889 
    890 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
    891 
    892     """ Return a wrapped version of file which provides transparent
    893         encoding translation.
    894 
    895         Strings written to the wrapped file are interpreted according
    896         to the given data_encoding and then written to the original
    897         file as string using file_encoding. The intermediate encoding
    898         will usually be Unicode but depends on the specified codecs.
    899 
    900         Strings are read from the file using file_encoding and then
    901         passed back to the caller as string using data_encoding.
    902 
    903         If file_encoding is not given, it defaults to data_encoding.
    904 
    905         errors may be given to define the error handling. It defaults
    906         to 'strict' which causes ValueErrors to be raised in case an
    907         encoding error occurs.
    908 
    909         The returned wrapped file object provides two extra attributes
    910         .data_encoding and .file_encoding which reflect the given
    911         parameters of the same name. The attributes can be used for
    912         introspection by Python programs.
    913 
    914     """
    915     if file_encoding is None:
    916         file_encoding = data_encoding
    917     data_info = lookup(data_encoding)
    918     file_info = lookup(file_encoding)
    919     sr = StreamRecoder(file, data_info.encode, data_info.decode,
    920                        file_info.streamreader, file_info.streamwriter, errors)
    921     # Add attributes to simplify introspection
    922     sr.data_encoding = data_encoding
    923     sr.file_encoding = file_encoding
    924     return sr
    925 
    926 ### Helpers for codec lookup
    927 
    928 def getencoder(encoding):
    929 
    930     """ Lookup up the codec for the given encoding and return
    931         its encoder function.
    932 
    933         Raises a LookupError in case the encoding cannot be found.
    934 
    935     """
    936     return lookup(encoding).encode
    937 
    938 def getdecoder(encoding):
    939 
    940     """ Lookup up the codec for the given encoding and return
    941         its decoder function.
    942 
    943         Raises a LookupError in case the encoding cannot be found.
    944 
    945     """
    946     return lookup(encoding).decode
    947 
    948 def getincrementalencoder(encoding):
    949 
    950     """ Lookup up the codec for the given encoding and return
    951         its IncrementalEncoder class or factory function.
    952 
    953         Raises a LookupError in case the encoding cannot be found
    954         or the codecs doesn't provide an incremental encoder.
    955 
    956     """
    957     encoder = lookup(encoding).incrementalencoder
    958     if encoder is None:
    959         raise LookupError(encoding)
    960     return encoder
    961 
    962 def getincrementaldecoder(encoding):
    963 
    964     """ Lookup up the codec for the given encoding and return
    965         its IncrementalDecoder class or factory function.
    966 
    967         Raises a LookupError in case the encoding cannot be found
    968         or the codecs doesn't provide an incremental decoder.
    969 
    970     """
    971     decoder = lookup(encoding).incrementaldecoder
    972     if decoder is None:
    973         raise LookupError(encoding)
    974     return decoder
    975 
    976 def getreader(encoding):
    977 
    978     """ Lookup up the codec for the given encoding and return
    979         its StreamReader class or factory function.
    980 
    981         Raises a LookupError in case the encoding cannot be found.
    982 
    983     """
    984     return lookup(encoding).streamreader
    985 
    986 def getwriter(encoding):
    987 
    988     """ Lookup up the codec for the given encoding and return
    989         its StreamWriter class or factory function.
    990 
    991         Raises a LookupError in case the encoding cannot be found.
    992 
    993     """
    994     return lookup(encoding).streamwriter
    995 
    996 def iterencode(iterator, encoding, errors='strict', **kwargs):
    997     """
    998     Encoding iterator.
    999 
   1000     Encodes the input strings from the iterator using a IncrementalEncoder.
   1001 
   1002     errors and kwargs are passed through to the IncrementalEncoder
   1003     constructor.
   1004     """
   1005     encoder = getincrementalencoder(encoding)(errors, **kwargs)
   1006     for input in iterator:
   1007         output = encoder.encode(input)
   1008         if output:
   1009             yield output
   1010     output = encoder.encode("", True)
   1011     if output:
   1012         yield output
   1013 
   1014 def iterdecode(iterator, encoding, errors='strict', **kwargs):
   1015     """
   1016     Decoding iterator.
   1017 
   1018     Decodes the input strings from the iterator using a IncrementalDecoder.
   1019 
   1020     errors and kwargs are passed through to the IncrementalDecoder
   1021     constructor.
   1022     """
   1023     decoder = getincrementaldecoder(encoding)(errors, **kwargs)
   1024     for input in iterator:
   1025         output = decoder.decode(input)
   1026         if output:
   1027             yield output
   1028     output = decoder.decode("", True)
   1029     if output:
   1030         yield output
   1031 
   1032 ### Helpers for charmap-based codecs
   1033 
   1034 def make_identity_dict(rng):
   1035 
   1036     """ make_identity_dict(rng) -> dict
   1037 
   1038         Return a dictionary where elements of the rng sequence are
   1039         mapped to themselves.
   1040 
   1041     """
   1042     res = {}
   1043     for i in rng:
   1044         res[i]=i
   1045     return res
   1046 
   1047 def make_encoding_map(decoding_map):
   1048 
   1049     """ Creates an encoding map from a decoding map.
   1050 
   1051         If a target mapping in the decoding map occurs multiple
   1052         times, then that target is mapped to None (undefined mapping),
   1053         causing an exception when encountered by the charmap codec
   1054         during translation.
   1055 
   1056         One example where this happens is cp875.py which decodes
   1057         multiple character to \u001a.
   1058 
   1059     """
   1060     m = {}
   1061     for k,v in decoding_map.items():
   1062         if not v in m:
   1063             m[v] = k
   1064         else:
   1065             m[v] = None
   1066     return m
   1067 
   1068 ### error handlers
   1069 
   1070 try:
   1071     strict_errors = lookup_error("strict")
   1072     ignore_errors = lookup_error("ignore")
   1073     replace_errors = lookup_error("replace")
   1074     xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
   1075     backslashreplace_errors = lookup_error("backslashreplace")
   1076 except LookupError:
   1077     # In --disable-unicode builds, these error handler are missing
   1078     strict_errors = None
   1079     ignore_errors = None
   1080     replace_errors = None
   1081     xmlcharrefreplace_errors = None
   1082     backslashreplace_errors = None
   1083 
   1084 # Tell modulefinder that using codecs probably needs the encodings
   1085 # package
# The import below sits behind a flag that is always false, so it is
# never executed at runtime; it is only there to be seen by tools
# that scan the module for import statements.
_false = 0
if _false:
    import encodings
   1089 
   1090 ### Tests
   1091 
   1092 if __name__ == '__main__':
   1093 
   1094     # Make stdout translate Latin-1 output into UTF-8 output
   1095     sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
   1096 
   1097     # Have stdin translate Latin-1 input into UTF-8 input
   1098     sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
   1099