Home | History | Annotate | Download | only in encodings
      1 """ Python 'utf-8-sig' Codec
      2 This works similarly to UTF-8 with the following changes:
      3 
      4 * On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
      5   first three bytes.
      6 
      7 * On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
      8   bytes will be skipped.
      9 """
     10 import codecs
     11 
     12 ### Codec APIs

     13 
     14 def encode(input, errors='strict'):
     15     return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
     16 
     17 def decode(input, errors='strict'):
     18     prefix = 0
     19     if input[:3] == codecs.BOM_UTF8:
     20         input = input[3:]
     21         prefix = 3
     22     (output, consumed) = codecs.utf_8_decode(input, errors, True)
     23     return (output, consumed+prefix)
     24 
     25 class IncrementalEncoder(codecs.IncrementalEncoder):
     26     def __init__(self, errors='strict'):
     27         codecs.IncrementalEncoder.__init__(self, errors)
     28         self.first = 1
     29 
     30     def encode(self, input, final=False):
     31         if self.first:
     32             self.first = 0
     33             return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0]
     34         else:
     35             return codecs.utf_8_encode(input, self.errors)[0]
     36 
     37     def reset(self):
     38         codecs.IncrementalEncoder.reset(self)
     39         self.first = 1
     40 
     41     def getstate(self):
     42         return self.first
     43 
     44     def setstate(self, state):
     45         self.first = state
     46 
     47 class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
     48     def __init__(self, errors='strict'):
     49         codecs.BufferedIncrementalDecoder.__init__(self, errors)
     50         self.first = True
     51 
     52     def _buffer_decode(self, input, errors, final):
     53         if self.first:
     54             if len(input) < 3:
     55                 if codecs.BOM_UTF8.startswith(input):
     56                     # not enough data to decide if this really is a BOM

     57                     # => try again on the next call

     58                     return (u"", 0)
     59                 else:
     60                     self.first = None
     61             else:
     62                 self.first = None
     63                 if input[:3] == codecs.BOM_UTF8:
     64                     (output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
     65                     return (output, consumed+3)
     66         return codecs.utf_8_decode(input, errors, final)
     67 
     68     def reset(self):
     69         codecs.BufferedIncrementalDecoder.reset(self)
     70         self.first = True
     71 
     72 class StreamWriter(codecs.StreamWriter):
     73     def reset(self):
     74         codecs.StreamWriter.reset(self)
     75         try:
     76             del self.encode
     77         except AttributeError:
     78             pass
     79 
     80     def encode(self, input, errors='strict'):
     81         self.encode = codecs.utf_8_encode
     82         return encode(input, errors)
     83 
     84 class StreamReader(codecs.StreamReader):
     85     def reset(self):
     86         codecs.StreamReader.reset(self)
     87         try:
     88             del self.decode
     89         except AttributeError:
     90             pass
     91 
     92     def decode(self, input, errors='strict'):
     93         if len(input) < 3:
     94             if codecs.BOM_UTF8.startswith(input):
     95                 # not enough data to decide if this is a BOM

     96                 # => try again on the next call

     97                 return (u"", 0)
     98         elif input[:3] == codecs.BOM_UTF8:
     99             self.decode = codecs.utf_8_decode
    100             (output, consumed) = codecs.utf_8_decode(input[3:],errors)
    101             return (output, consumed+3)
    102         # (else) no BOM present

    103         self.decode = codecs.utf_8_decode
    104         return codecs.utf_8_decode(input, errors)
    105 
    106 ### encodings module API

    107 
    108 def getregentry():
    109     return codecs.CodecInfo(
    110         name='utf-8-sig',
    111         encode=encode,
    112         decode=decode,
    113         incrementalencoder=IncrementalEncoder,
    114         incrementaldecoder=IncrementalDecoder,
    115         streamreader=StreamReader,
    116         streamwriter=StreamWriter,
    117     )
    118