Home | History | Annotate | Download | only in Lib
      1 """ codecs -- Python Codec Registry, API and helpers.
      2 
      3 
Written by Marc-Andre Lemburg (mal@lemburg.com).
      5 
      6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
      7 
      8 """#"

      9 
     10 import __builtin__, sys
     11 
     12 ### Registry and builtin stateless codec functions

     13 
     14 try:
     15     from _codecs import *
     16 except ImportError, why:
     17     raise SystemError('Failed to load the builtin codecs: %s' % why)
     18 
# Public API of this module: these are the names that
# "from codecs import *" makes available to the importing namespace.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
     32 
     33 ### Constants

     34 
     35 #

     36 # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)

     37 # and its possible byte string values

     38 # for UTF8/UTF16/UTF32 output and little/big endian machines

     39 #

     40 
     41 # UTF-8

     42 BOM_UTF8 = '\xef\xbb\xbf'
     43 
     44 # UTF-16, little endian

     45 BOM_LE = BOM_UTF16_LE = '\xff\xfe'
     46 
     47 # UTF-16, big endian

     48 BOM_BE = BOM_UTF16_BE = '\xfe\xff'
     49 
     50 # UTF-32, little endian

     51 BOM_UTF32_LE = '\xff\xfe\x00\x00'
     52 
     53 # UTF-32, big endian

     54 BOM_UTF32_BE = '\x00\x00\xfe\xff'
     55 
     56 if sys.byteorder == 'little':
     57 
     58     # UTF-16, native endianness

     59     BOM = BOM_UTF16 = BOM_UTF16_LE
     60 
     61     # UTF-32, native endianness

     62     BOM_UTF32 = BOM_UTF32_LE
     63 
     64 else:
     65 
     66     # UTF-16, native endianness

     67     BOM = BOM_UTF16 = BOM_UTF16_BE
     68 
     69     # UTF-32, native endianness

     70     BOM_UTF32 = BOM_UTF32_BE
     71 
     72 # Old broken names (don't use in new code)

     73 BOM32_LE = BOM_UTF16_LE
     74 BOM32_BE = BOM_UTF16_BE
     75 BOM64_LE = BOM_UTF32_LE
     76 BOM64_BE = BOM_UTF32_BE
     77 
     78 
     79 ### Codec base classes (defining the API)

     80 
     81 class CodecInfo(tuple):
     82 
     83     def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
     84         incrementalencoder=None, incrementaldecoder=None, name=None):
     85         self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
     86         self.name = name
     87         self.encode = encode
     88         self.decode = decode
     89         self.incrementalencoder = incrementalencoder
     90         self.incrementaldecoder = incrementaldecoder
     91         self.streamwriter = streamwriter
     92         self.streamreader = streamreader
     93         return self
     94 
     95     def __repr__(self):
     96         return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
     97 
     98 class Codec:
     99 
    100     """ Defines the interface for stateless encoders/decoders.
    101 
    102         The .encode()/.decode() methods may use different error
    103         handling schemes by providing the errors argument. These
    104         string values are predefined:
    105 
    106          'strict' - raise a ValueError error (or a subclass)
    107          'ignore' - ignore the character and continue with the next
    108          'replace' - replace with a suitable replacement character;
    109                     Python will use the official U+FFFD REPLACEMENT
    110                     CHARACTER for the builtin Unicode codecs on
    111                     decoding and '?' on encoding.
    112          'xmlcharrefreplace' - Replace with the appropriate XML
    113                                character reference (only for encoding).
    114          'backslashreplace'  - Replace with backslashed escape sequences
    115                                (only for encoding).
    116 
    117         The set of allowed values can be extended via register_error.
    118 
    119     """
    120     def encode(self, input, errors='strict'):
    121 
    122         """ Encodes the object input and returns a tuple (output
    123             object, length consumed).
    124 
    125             errors defines the error handling to apply. It defaults to
    126             'strict' handling.
    127 
    128             The method may not store state in the Codec instance. Use
    129             StreamCodec for codecs which have to keep state in order to
    130             make encoding/decoding efficient.
    131 
    132             The encoder must be able to handle zero length input and
    133             return an empty object of the output object type in this
    134             situation.
    135 
    136         """
    137         raise NotImplementedError
    138 
    139     def decode(self, input, errors='strict'):
    140 
    141         """ Decodes the object input and returns a tuple (output
    142             object, length consumed).
    143 
    144             input must be an object which provides the bf_getreadbuf
    145             buffer slot. Python strings, buffer objects and memory
    146             mapped files are examples of objects providing this slot.
    147 
    148             errors defines the error handling to apply. It defaults to
    149             'strict' handling.
    150 
    151             The method may not store state in the Codec instance. Use
    152             StreamCodec for codecs which have to keep state in order to
    153             make encoding/decoding efficient.
    154 
    155             The decoder must be able to handle zero length input and
    156             return an empty object of the output object type in this
    157             situation.
    158 
    159         """
    160         raise NotImplementedError
    161 
    162 class IncrementalEncoder(object):
    163     """
    164     An IncrementalEncoder encodes an input in multiple steps. The input can be
    165     passed piece by piece to the encode() method. The IncrementalEncoder remembers
    166     the state of the Encoding process between calls to encode().
    167     """
    168     def __init__(self, errors='strict'):
    169         """
    170         Creates an IncrementalEncoder instance.
    171 
    172         The IncrementalEncoder may use different error handling schemes by
    173         providing the errors keyword argument. See the module docstring
    174         for a list of possible values.
    175         """
    176         self.errors = errors
    177         self.buffer = ""
    178 
    179     def encode(self, input, final=False):
    180         """
    181         Encodes input and returns the resulting object.
    182         """
    183         raise NotImplementedError
    184 
    185     def reset(self):
    186         """
    187         Resets the encoder to the initial state.
    188         """
    189 
    190     def getstate(self):
    191         """
    192         Return the current state of the encoder.
    193         """
    194         return 0
    195 
    196     def setstate(self, state):
    197         """
    198         Set the current state of the encoder. state must have been
    199         returned by getstate().
    200         """
    201 
    202 class BufferedIncrementalEncoder(IncrementalEncoder):
    203     """
    204     This subclass of IncrementalEncoder can be used as the baseclass for an
    205     incremental encoder if the encoder must keep some of the output in a
    206     buffer between calls to encode().
    207     """
    208     def __init__(self, errors='strict'):
    209         IncrementalEncoder.__init__(self, errors)
    210         self.buffer = "" # unencoded input that is kept between calls to encode()

    211 
    212     def _buffer_encode(self, input, errors, final):
    213         # Overwrite this method in subclasses: It must encode input

    214         # and return an (output, length consumed) tuple

    215         raise NotImplementedError
    216 
    217     def encode(self, input, final=False):
    218         # encode input (taking the buffer into account)

    219         data = self.buffer + input
    220         (result, consumed) = self._buffer_encode(data, self.errors, final)
    221         # keep unencoded input until the next call

    222         self.buffer = data[consumed:]
    223         return result
    224 
    225     def reset(self):
    226         IncrementalEncoder.reset(self)
    227         self.buffer = ""
    228 
    229     def getstate(self):
    230         return self.buffer or 0
    231 
    232     def setstate(self, state):
    233         self.buffer = state or ""
    234 
    235 class IncrementalDecoder(object):
    236     """
    237     An IncrementalDecoder decodes an input in multiple steps. The input can be
    238     passed piece by piece to the decode() method. The IncrementalDecoder
    239     remembers the state of the decoding process between calls to decode().
    240     """
    241     def __init__(self, errors='strict'):
    242         """
    243         Creates a IncrementalDecoder instance.
    244 
    245         The IncrementalDecoder may use different error handling schemes by
    246         providing the errors keyword argument. See the module docstring
    247         for a list of possible values.
    248         """
    249         self.errors = errors
    250 
    251     def decode(self, input, final=False):
    252         """
    253         Decodes input and returns the resulting object.
    254         """
    255         raise NotImplementedError
    256 
    257     def reset(self):
    258         """
    259         Resets the decoder to the initial state.
    260         """
    261 
    262     def getstate(self):
    263         """
    264         Return the current state of the decoder.
    265 
    266         This must be a (buffered_input, additional_state_info) tuple.
    267         buffered_input must be a bytes object containing bytes that
    268         were passed to decode() that have not yet been converted.
    269         additional_state_info must be a non-negative integer
    270         representing the state of the decoder WITHOUT yet having
    271         processed the contents of buffered_input.  In the initial state
    272         and after reset(), getstate() must return (b"", 0).
    273         """
    274         return (b"", 0)
    275 
    276     def setstate(self, state):
    277         """
    278         Set the current state of the decoder.
    279 
    280         state must have been returned by getstate().  The effect of
    281         setstate((b"", 0)) must be equivalent to reset().
    282         """
    283 
    284 class BufferedIncrementalDecoder(IncrementalDecoder):
    285     """
    286     This subclass of IncrementalDecoder can be used as the baseclass for an
    287     incremental decoder if the decoder must be able to handle incomplete byte
    288     sequences.
    289     """
    290     def __init__(self, errors='strict'):
    291         IncrementalDecoder.__init__(self, errors)
    292         self.buffer = "" # undecoded input that is kept between calls to decode()

    293 
    294     def _buffer_decode(self, input, errors, final):
    295         # Overwrite this method in subclasses: It must decode input

    296         # and return an (output, length consumed) tuple

    297         raise NotImplementedError
    298 
    299     def decode(self, input, final=False):
    300         # decode input (taking the buffer into account)

    301         data = self.buffer + input
    302         (result, consumed) = self._buffer_decode(data, self.errors, final)
    303         # keep undecoded input until the next call

    304         self.buffer = data[consumed:]
    305         return result
    306 
    307     def reset(self):
    308         IncrementalDecoder.reset(self)
    309         self.buffer = ""
    310 
    311     def getstate(self):
    312         # additional state info is always 0

    313         return (self.buffer, 0)
    314 
    315     def setstate(self, state):
    316         # ignore additional state info

    317         self.buffer = state[0]
    318 
    319 #

# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.

    324 #

    325 
    326 class StreamWriter(Codec):
    327 
    328     def __init__(self, stream, errors='strict'):
    329 
    330         """ Creates a StreamWriter instance.
    331 
    332             stream must be a file-like object open for writing
    333             (binary) data.
    334 
    335             The StreamWriter may use different error handling
    336             schemes by providing the errors keyword argument. These
    337             parameters are predefined:
    338 
    339              'strict' - raise a ValueError (or a subclass)
    340              'ignore' - ignore the character and continue with the next
    341              'replace'- replace with a suitable replacement character
    342              'xmlcharrefreplace' - Replace with the appropriate XML
    343                                    character reference.
    344              'backslashreplace'  - Replace with backslashed escape
    345                                    sequences (only for encoding).
    346 
    347             The set of allowed parameter values can be extended via
    348             register_error.
    349         """
    350         self.stream = stream
    351         self.errors = errors
    352 
    353     def write(self, object):
    354 
    355         """ Writes the object's contents encoded to self.stream.
    356         """
    357         data, consumed = self.encode(object, self.errors)
    358         self.stream.write(data)
    359 
    360     def writelines(self, list):
    361 
    362         """ Writes the concatenated list of strings to the stream
    363             using .write().
    364         """
    365         self.write(''.join(list))
    366 
    367     def reset(self):
    368 
    369         """ Flushes and resets the codec buffers used for keeping state.
    370 
    371             Calling this method should ensure that the data on the
    372             output is put into a clean state, that allows appending
    373             of new fresh data without having to rescan the whole
    374             stream to recover state.
    375 
    376         """
    377         pass
    378 
    379     def seek(self, offset, whence=0):
    380         self.stream.seek(offset, whence)
    381         if whence == 0 and offset == 0:
    382             self.reset()
    383 
    384     def __getattr__(self, name,
    385                     getattr=getattr):
    386 
    387         """ Inherit all other methods from the underlying stream.
    388         """
    389         return getattr(self.stream, name)
    390 
    391     def __enter__(self):
    392         return self
    393 
    394     def __exit__(self, type, value, tb):
    395         self.stream.close()
    396 
    397 ###

    398 
    399 class StreamReader(Codec):
    400 
    401     def __init__(self, stream, errors='strict'):
    402 
    403         """ Creates a StreamReader instance.
    404 
    405             stream must be a file-like object open for reading
    406             (binary) data.
    407 
    408             The StreamReader may use different error handling
    409             schemes by providing the errors keyword argument. These
    410             parameters are predefined:
    411 
    412              'strict' - raise a ValueError (or a subclass)
    413              'ignore' - ignore the character and continue with the next
    414              'replace'- replace with a suitable replacement character;
    415 
    416             The set of allowed parameter values can be extended via
    417             register_error.
    418         """
    419         self.stream = stream
    420         self.errors = errors
    421         self.bytebuffer = ""
    422         # For str->str decoding this will stay a str

    423         # For str->unicode decoding the first read will promote it to unicode

    424         self.charbuffer = ""
    425         self.linebuffer = None
    426 
    427     def decode(self, input, errors='strict'):
    428         raise NotImplementedError
    429 
    430     def read(self, size=-1, chars=-1, firstline=False):
    431 
    432         """ Decodes data from the stream self.stream and returns the
    433             resulting object.
    434 
    435             chars indicates the number of characters to read from the
    436             stream. read() will never return more than chars
    437             characters, but it might return less, if there are not enough
    438             characters available.
    439 
    440             size indicates the approximate maximum number of bytes to
    441             read from the stream for decoding purposes. The decoder
    442             can modify this setting as appropriate. The default value
    443             -1 indicates to read and decode as much as possible.  size
    444             is intended to prevent having to decode huge files in one
    445             step.
    446 
    447             If firstline is true, and a UnicodeDecodeError happens
    448             after the first line terminator in the input only the first line
    449             will be returned, the rest of the input will be kept until the
    450             next call to read().
    451 
    452             The method should use a greedy read strategy meaning that
    453             it should read as much data as is allowed within the
    454             definition of the encoding and the given size, e.g.  if
    455             optional encoding endings or state markers are available
    456             on the stream, these should be read too.
    457         """
    458         # If we have lines cached, first merge them back into characters

    459         if self.linebuffer:
    460             self.charbuffer = "".join(self.linebuffer)
    461             self.linebuffer = None
    462 
    463         # read until we get the required number of characters (if available)

    464         while True:
    465             # can the request be satisfied from the character buffer?

    466             if chars >= 0:
    467                 if len(self.charbuffer) >= chars:
    468                     break
    469             elif size >= 0:
    470                 if len(self.charbuffer) >= size:
    471                     break
    472             # we need more data

    473             if size < 0:
    474                 newdata = self.stream.read()
    475             else:
    476                 newdata = self.stream.read(size)
    477             # decode bytes (those remaining from the last call included)

    478             data = self.bytebuffer + newdata
    479             try:
    480                 newchars, decodedbytes = self.decode(data, self.errors)
    481             except UnicodeDecodeError, exc:
    482                 if firstline:
    483                     newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
    484                     lines = newchars.splitlines(True)
    485                     if len(lines)<=1:
    486                         raise
    487                 else:
    488                     raise
    489             # keep undecoded bytes until the next call

    490             self.bytebuffer = data[decodedbytes:]
    491             # put new characters in the character buffer

    492             self.charbuffer += newchars
    493             # there was no data available

    494             if not newdata:
    495                 break
    496         if chars < 0:
    497             # Return everything we've got

    498             result = self.charbuffer
    499             self.charbuffer = ""
    500         else:
    501             # Return the first chars characters

    502             result = self.charbuffer[:chars]
    503             self.charbuffer = self.charbuffer[chars:]
    504         return result
    505 
    506     def readline(self, size=None, keepends=True):
    507 
    508         """ Read one line from the input stream and return the
    509             decoded data.
    510 
    511             size, if given, is passed as size argument to the
    512             read() method.
    513 
    514         """
    515         # If we have lines cached from an earlier read, return

    516         # them unconditionally

    517         if self.linebuffer:
    518             line = self.linebuffer[0]
    519             del self.linebuffer[0]
    520             if len(self.linebuffer) == 1:
    521                 # revert to charbuffer mode; we might need more data

    522                 # next time

    523                 self.charbuffer = self.linebuffer[0]
    524                 self.linebuffer = None
    525             if not keepends:
    526                 line = line.splitlines(False)[0]
    527             return line
    528 
    529         readsize = size or 72
    530         line = ""
    531         # If size is given, we call read() only once

    532         while True:
    533             data = self.read(readsize, firstline=True)
    534             if data:
    535                 # If we're at a "\r" read one extra character (which might

    536                 # be a "\n") to get a proper line ending. If the stream is

    537                 # temporarily exhausted we return the wrong line ending.

    538                 if data.endswith("\r"):
    539                     data += self.read(size=1, chars=1)
    540 
    541             line += data
    542             lines = line.splitlines(True)
    543             if lines:
    544                 if len(lines) > 1:
    545                     # More than one line result; the first line is a full line

    546                     # to return

    547                     line = lines[0]
    548                     del lines[0]
    549                     if len(lines) > 1:
    550                         # cache the remaining lines

    551                         lines[-1] += self.charbuffer
    552                         self.linebuffer = lines
    553                         self.charbuffer = None
    554                     else:
    555                         # only one remaining line, put it back into charbuffer

    556                         self.charbuffer = lines[0] + self.charbuffer
    557                     if not keepends:
    558                         line = line.splitlines(False)[0]
    559                     break
    560                 line0withend = lines[0]
    561                 line0withoutend = lines[0].splitlines(False)[0]
    562                 if line0withend != line0withoutend: # We really have a line end

    563                     # Put the rest back together and keep it until the next call

    564                     self.charbuffer = "".join(lines[1:]) + self.charbuffer
    565                     if keepends:
    566                         line = line0withend
    567                     else:
    568                         line = line0withoutend
    569                     break
    570             # we didn't get anything or this was our only try

    571             if not data or size is not None:
    572                 if line and not keepends:
    573                     line = line.splitlines(False)[0]
    574                 break
    575             if readsize<8000:
    576                 readsize *= 2
    577         return line
    578 
    579     def readlines(self, sizehint=None, keepends=True):
    580 
    581         """ Read all lines available on the input stream
    582             and return them as list of lines.
    583 
    584             Line breaks are implemented using the codec's decoder
    585             method and are included in the list entries.
    586 
    587             sizehint, if given, is ignored since there is no efficient
    588             way to finding the true end-of-line.
    589 
    590         """
    591         data = self.read()
    592         return data.splitlines(keepends)
    593 
    594     def reset(self):
    595 
    596         """ Resets the codec buffers used for keeping state.
    597 
    598             Note that no stream repositioning should take place.
    599             This method is primarily intended to be able to recover
    600             from decoding errors.
    601 
    602         """
    603         self.bytebuffer = ""
    604         self.charbuffer = u""
    605         self.linebuffer = None
    606 
    607     def seek(self, offset, whence=0):
    608         """ Set the input stream's current position.
    609 
    610             Resets the codec buffers used for keeping state.
    611         """
    612         self.stream.seek(offset, whence)
    613         self.reset()
    614 
    615     def next(self):
    616 
    617         """ Return the next decoded line from the input stream."""
    618         line = self.readline()
    619         if line:
    620             return line
    621         raise StopIteration
    622 
    623     def __iter__(self):
    624         return self
    625 
    626     def __getattr__(self, name,
    627                     getattr=getattr):
    628 
    629         """ Inherit all other methods from the underlying stream.
    630         """
    631         return getattr(self.stream, name)
    632 
    633     def __enter__(self):
    634         return self
    635 
    636     def __exit__(self, type, value, tb):
    637         self.stream.close()
    638 
    639 ###

    640 
    641 class StreamReaderWriter:
    642 
    643     """ StreamReaderWriter instances allow wrapping streams which
    644         work in both read and write modes.
    645 
    646         The design is such that one can use the factory functions
    647         returned by the codec.lookup() function to construct the
    648         instance.
    649 
    650     """
    651     # Optional attributes set by the file wrappers below

    652     encoding = 'unknown'
    653 
    654     def __init__(self, stream, Reader, Writer, errors='strict'):
    655 
    656         """ Creates a StreamReaderWriter instance.
    657 
    658             stream must be a Stream-like object.
    659 
    660             Reader, Writer must be factory functions or classes
    661             providing the StreamReader, StreamWriter interface resp.
    662 
    663             Error handling is done in the same way as defined for the
    664             StreamWriter/Readers.
    665 
    666         """
    667         self.stream = stream
    668         self.reader = Reader(stream, errors)
    669         self.writer = Writer(stream, errors)
    670         self.errors = errors
    671 
    672     def read(self, size=-1):
    673 
    674         return self.reader.read(size)
    675 
    676     def readline(self, size=None):
    677 
    678         return self.reader.readline(size)
    679 
    680     def readlines(self, sizehint=None):
    681 
    682         return self.reader.readlines(sizehint)
    683 
    684     def next(self):
    685 
    686         """ Return the next decoded line from the input stream."""
    687         return self.reader.next()
    688 
    689     def __iter__(self):
    690         return self
    691 
    692     def write(self, data):
    693 
    694         return self.writer.write(data)
    695 
    696     def writelines(self, list):
    697 
    698         return self.writer.writelines(list)
    699 
    700     def reset(self):
    701 
    702         self.reader.reset()
    703         self.writer.reset()
    704 
    705     def seek(self, offset, whence=0):
    706         self.stream.seek(offset, whence)
    707         self.reader.reset()
    708         if whence == 0 and offset == 0:
    709             self.writer.reset()
    710 
    711     def __getattr__(self, name,
    712                     getattr=getattr):
    713 
    714         """ Inherit all other methods from the underlying stream.
    715         """
    716         return getattr(self.stream, name)
    717 
    718     # these are needed to make "with codecs.open(...)" work properly

    719 
    720     def __enter__(self):
    721         return self
    722 
    723     def __exit__(self, type, value, tb):
    724         self.stream.close()
    725 
    726 ###

    727 
    728 class StreamRecoder:
    729 
    730     """ StreamRecoder instances provide a frontend - backend
    731         view of encoding data.
    732 
    733         They use the complete set of APIs returned by the
    734         codecs.lookup() function to implement their task.
    735 
    736         Data written to the stream is first decoded into an
    737         intermediate format (which is dependent on the given codec
    738         combination) and then written to the stream using an instance
    739         of the provided Writer class.
    740 
    741         In the other direction, data is read from the stream using a
    742         Reader instance and then return encoded data to the caller.
    743 
    744     """
    745     # Optional attributes set by the file wrappers below

    746     data_encoding = 'unknown'
    747     file_encoding = 'unknown'
    748 
    749     def __init__(self, stream, encode, decode, Reader, Writer,
    750                  errors='strict'):
    751 
    752         """ Creates a StreamRecoder instance which implements a two-way
    753             conversion: encode and decode work on the frontend (the
    754             input to .read() and output of .write()) while
    755             Reader and Writer work on the backend (reading and
    756             writing to the stream).
    757 
    758             You can use these objects to do transparent direct
    759             recodings from e.g. latin-1 to utf-8 and back.
    760 
    761             stream must be a file-like object.
    762 
    763             encode, decode must adhere to the Codec interface, Reader,
    764             Writer must be factory functions or classes providing the
    765             StreamReader, StreamWriter interface resp.
    766 
    767             encode and decode are needed for the frontend translation,
    768             Reader and Writer for the backend translation. Unicode is
    769             used as intermediate encoding.
    770 
    771             Error handling is done in the same way as defined for the
    772             StreamWriter/Readers.
    773 
    774         """
    775         self.stream = stream
    776         self.encode = encode
    777         self.decode = decode
    778         self.reader = Reader(stream, errors)
    779         self.writer = Writer(stream, errors)
    780         self.errors = errors
    781 
    def read(self, size=-1):

        """ Read from the backend Reader and return the data after
            passing it through the frontend encode function.

            size is forwarded unchanged to the Reader's read() method.
        """
        raw = self.reader.read(size)
        # encode returns (output, length consumed); only the output is used
        encoded, _consumed = self.encode(raw, self.errors)
        return encoded
    787 
    def readline(self, size=None):

        """ Read one line from the backend Reader and return it after
            passing it through the frontend encode function.

            When size is None the Reader's readline() is called
            without arguments, otherwise size is passed along.
        """
        # Only forward size when the caller actually supplied one
        args = () if size is None else (size,)
        line = self.reader.readline(*args)
        encoded, _consumed = self.encode(line, self.errors)
        return encoded
    796 
    def readlines(self, sizehint=None):

        """ Read everything from the backend Reader, pass it through
            the frontend encode function and return the result as a
            list of lines with their line endings kept.

            sizehint is accepted for interface compatibility but is
            not used.
        """
        everything = self.reader.read()
        encoded, _consumed = self.encode(everything, self.errors)
        return encoded.splitlines(True)
    802 
    def next(self):

        """ Fetch the next item from the backend Reader and return it
            after passing it through the frontend encode function."""
        item = self.reader.next()
        encoded, _consumed = self.encode(item, self.errors)
        return encoded
    809 
    810     def __iter__(self):
    811         return self
    812 
    def write(self, data):

        """ Pass data through the frontend decode function and write
            the result with the backend Writer.

            Returns whatever the Writer's write() method returns.
        """
        decoded, _consumed = self.decode(data, self.errors)
        return self.writer.write(decoded)
    817 
    def writelines(self, list):

        """ Concatenate the strings in list, pass the result through
            the frontend decode function and write it with the backend
            Writer in a single call.

            Returns whatever the Writer's write() method returns.
        """
        joined = ''.join(list)
        decoded, _consumed = self.decode(joined, self.errors)
        return self.writer.write(decoded)
    823 
    def reset(self):

        """ Reset the backend Reader first, then the backend Writer. """
        reader = self.reader
        reader.reset()
        writer = self.writer
        writer.reset()
    828 
    829     def __getattr__(self, name,
    830                     getattr=getattr):
    831 
    832         """ Inherit all other methods from the underlying stream.
    833         """
    834         return getattr(self.stream, name)
    835 
    836     def __enter__(self):
    837         return self
    838 
    839     def __exit__(self, type, value, tb):
    840         self.stream.close()
    841 
    842 ### Shortcuts

    843 
    844 def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
    845 
    846     """ Open an encoded file using the given mode and return
    847         a wrapped version providing transparent encoding/decoding.
    848 
    849         Note: The wrapped version will only accept the object format
    850         defined by the codecs, i.e. Unicode objects for most builtin
    851         codecs. Output is also codec dependent and will usually be
    852         Unicode as well.
    853 
    854         Files are always opened in binary mode, even if no binary mode
    855         was specified. This is done to avoid data loss due to encodings
    856         using 8-bit values. The default file mode is 'rb' meaning to
    857         open the file in binary read mode.
    858 
    859         encoding specifies the encoding which is to be used for the
    860         file.
    861 
    862         errors may be given to define the error handling. It defaults
    863         to 'strict' which causes ValueErrors to be raised in case an
    864         encoding error occurs.
    865 
    866         buffering has the same meaning as for the builtin open() API.
    867         It defaults to line buffered.
    868 
    869         The returned wrapped file object provides an extra attribute
    870         .encoding which allows querying the used encoding. This
    871         attribute is only available if an encoding was specified as
    872         parameter.
    873 
    874     """
    875     if encoding is not None:
    876         if 'U' in mode:
    877             # No automatic conversion of '\n' is done on reading and writing

    878             mode = mode.strip().replace('U', '')
    879             if mode[:1] not in set('rwa'):
    880                 mode = 'r' + mode
    881         if 'b' not in mode:
    882             # Force opening of the file in binary mode

    883             mode = mode + 'b'
    884     file = __builtin__.open(filename, mode, buffering)
    885     if encoding is None:
    886         return file
    887     info = lookup(encoding)
    888     srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    889     # Add attributes to simplify introspection

    890     srw.encoding = encoding
    891     return srw
    892 
    893 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
    894 
    895     """ Return a wrapped version of file which provides transparent
    896         encoding translation.
    897 
    898         Strings written to the wrapped file are interpreted according
    899         to the given data_encoding and then written to the original
    900         file as string using file_encoding. The intermediate encoding
    901         will usually be Unicode but depends on the specified codecs.
    902 
    903         Strings are read from the file using file_encoding and then
    904         passed back to the caller as string using data_encoding.
    905 
    906         If file_encoding is not given, it defaults to data_encoding.
    907 
    908         errors may be given to define the error handling. It defaults
    909         to 'strict' which causes ValueErrors to be raised in case an
    910         encoding error occurs.
    911 
    912         The returned wrapped file object provides two extra attributes
    913         .data_encoding and .file_encoding which reflect the given
    914         parameters of the same name. The attributes can be used for
    915         introspection by Python programs.
    916 
    917     """
    918     if file_encoding is None:
    919         file_encoding = data_encoding
    920     data_info = lookup(data_encoding)
    921     file_info = lookup(file_encoding)
    922     sr = StreamRecoder(file, data_info.encode, data_info.decode,
    923                        file_info.streamreader, file_info.streamwriter, errors)
    924     # Add attributes to simplify introspection

    925     sr.data_encoding = data_encoding
    926     sr.file_encoding = file_encoding
    927     return sr
    928 
    929 ### Helpers for codec lookup

    930 
    931 def getencoder(encoding):
    932 
    933     """ Lookup up the codec for the given encoding and return
    934         its encoder function.
    935 
    936         Raises a LookupError in case the encoding cannot be found.
    937 
    938     """
    939     return lookup(encoding).encode
    940 
    941 def getdecoder(encoding):
    942 
    943     """ Lookup up the codec for the given encoding and return
    944         its decoder function.
    945 
    946         Raises a LookupError in case the encoding cannot be found.
    947 
    948     """
    949     return lookup(encoding).decode
    950 
    951 def getincrementalencoder(encoding):
    952 
    953     """ Lookup up the codec for the given encoding and return
    954         its IncrementalEncoder class or factory function.
    955 
    956         Raises a LookupError in case the encoding cannot be found
    957         or the codecs doesn't provide an incremental encoder.
    958 
    959     """
    960     encoder = lookup(encoding).incrementalencoder
    961     if encoder is None:
    962         raise LookupError(encoding)
    963     return encoder
    964 
    965 def getincrementaldecoder(encoding):
    966 
    967     """ Lookup up the codec for the given encoding and return
    968         its IncrementalDecoder class or factory function.
    969 
    970         Raises a LookupError in case the encoding cannot be found
    971         or the codecs doesn't provide an incremental decoder.
    972 
    973     """
    974     decoder = lookup(encoding).incrementaldecoder
    975     if decoder is None:
    976         raise LookupError(encoding)
    977     return decoder
    978 
    979 def getreader(encoding):
    980 
    981     """ Lookup up the codec for the given encoding and return
    982         its StreamReader class or factory function.
    983 
    984         Raises a LookupError in case the encoding cannot be found.
    985 
    986     """
    987     return lookup(encoding).streamreader
    988 
    989 def getwriter(encoding):
    990 
    991     """ Lookup up the codec for the given encoding and return
    992         its StreamWriter class or factory function.
    993 
    994         Raises a LookupError in case the encoding cannot be found.
    995 
    996     """
    997     return lookup(encoding).streamwriter
    998 
    999 def iterencode(iterator, encoding, errors='strict', **kwargs):
   1000     """
   1001     Encoding iterator.
   1002 
   1003     Encodes the input strings from the iterator using a IncrementalEncoder.
   1004 
   1005     errors and kwargs are passed through to the IncrementalEncoder
   1006     constructor.
   1007     """
   1008     encoder = getincrementalencoder(encoding)(errors, **kwargs)
   1009     for input in iterator:
   1010         output = encoder.encode(input)
   1011         if output:
   1012             yield output
   1013     output = encoder.encode("", True)
   1014     if output:
   1015         yield output
   1016 
   1017 def iterdecode(iterator, encoding, errors='strict', **kwargs):
   1018     """
   1019     Decoding iterator.
   1020 
   1021     Decodes the input strings from the iterator using a IncrementalDecoder.
   1022 
   1023     errors and kwargs are passed through to the IncrementalDecoder
   1024     constructor.
   1025     """
   1026     decoder = getincrementaldecoder(encoding)(errors, **kwargs)
   1027     for input in iterator:
   1028         output = decoder.decode(input)
   1029         if output:
   1030             yield output
   1031     output = decoder.decode("", True)
   1032     if output:
   1033         yield output
   1034 
   1035 ### Helpers for charmap-based codecs

   1036 
   1037 def make_identity_dict(rng):
   1038 
   1039     """ make_identity_dict(rng) -> dict
   1040 
   1041         Return a dictionary where elements of the rng sequence are
   1042         mapped to themselves.
   1043 
   1044     """
   1045     res = {}
   1046     for i in rng:
   1047         res[i]=i
   1048     return res
   1049 
   1050 def make_encoding_map(decoding_map):
   1051 
   1052     """ Creates an encoding map from a decoding map.
   1053 
   1054         If a target mapping in the decoding map occurs multiple
   1055         times, then that target is mapped to None (undefined mapping),
   1056         causing an exception when encountered by the charmap codec
   1057         during translation.
   1058 
   1059         One example where this happens is cp875.py which decodes
   1060         multiple character to \\u001a.
   1061 
   1062     """
   1063     m = {}
   1064     for k,v in decoding_map.items():
   1065         if not v in m:
   1066             m[v] = k
   1067         else:
   1068             m[v] = None
   1069     return m
   1070 
   1071 ### error handlers

   1072 
   1073 try:
   1074     strict_errors = lookup_error("strict")
   1075     ignore_errors = lookup_error("ignore")
   1076     replace_errors = lookup_error("replace")
   1077     xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
   1078     backslashreplace_errors = lookup_error("backslashreplace")
   1079 except LookupError:
   1080     # In --disable-unicode builds, these error handler are missing

   1081     strict_errors = None
   1082     ignore_errors = None
   1083     replace_errors = None
   1084     xmlcharrefreplace_errors = None
   1085     backslashreplace_errors = None
   1086 
   1087 # Tell modulefinder that using codecs probably needs the encodings

   1088 # package

   1089 _false = 0
   1090 if _false:
   1091     import encodings
   1092 
   1093 ### Tests

   1094 
   1095 if __name__ == '__main__':
   1096 
   1097     # Make stdout translate Latin-1 output into UTF-8 output

   1098     sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
   1099 
   1100     # Have stdin translate Latin-1 input into UTF-8 input

   1101     sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
   1102