Home | History | Annotate | Download | only in encodings
      1 """ Python 'utf-8-sig' Codec
      2 This works similarly to UTF-8, with the following changes:
      3 
      4 * On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
      5   first three bytes.
      6 
      7 * On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
      8   bytes will be skipped.
      9 """
     10 import codecs
     11 
     12 ### Codec APIs
     13 
     14 def encode(input, errors='strict'):
     15     return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0],
     16             len(input))
     17 
     18 def decode(input, errors='strict'):
     19     prefix = 0
     20     if input[:3] == codecs.BOM_UTF8:
     21         input = input[3:]
     22         prefix = 3
     23     (output, consumed) = codecs.utf_8_decode(input, errors, True)
     24     return (output, consumed+prefix)
     25 
     26 class IncrementalEncoder(codecs.IncrementalEncoder):
     27     def __init__(self, errors='strict'):
     28         codecs.IncrementalEncoder.__init__(self, errors)
     29         self.first = 1
     30 
     31     def encode(self, input, final=False):
     32         if self.first:
     33             self.first = 0
     34             return codecs.BOM_UTF8 + \
     35                    codecs.utf_8_encode(input, self.errors)[0]
     36         else:
     37             return codecs.utf_8_encode(input, self.errors)[0]
     38 
     39     def reset(self):
     40         codecs.IncrementalEncoder.reset(self)
     41         self.first = 1
     42 
     43     def getstate(self):
     44         return self.first
     45 
     46     def setstate(self, state):
     47         self.first = state
     48 
     49 class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
     50     def __init__(self, errors='strict'):
     51         codecs.BufferedIncrementalDecoder.__init__(self, errors)
     52         self.first = 1
     53 
     54     def _buffer_decode(self, input, errors, final):
     55         if self.first:
     56             if len(input) < 3:
     57                 if codecs.BOM_UTF8.startswith(input):
     58                     # not enough data to decide if this really is a BOM
     59                     # => try again on the next call
     60                     return ("", 0)
     61                 else:
     62                     self.first = 0
     63             else:
     64                 self.first = 0
     65                 if input[:3] == codecs.BOM_UTF8:
     66                     (output, consumed) = \
     67                        codecs.utf_8_decode(input[3:], errors, final)
     68                     return (output, consumed+3)
     69         return codecs.utf_8_decode(input, errors, final)
     70 
     71     def reset(self):
     72         codecs.BufferedIncrementalDecoder.reset(self)
     73         self.first = 1
     74 
     75     def getstate(self):
     76         state = codecs.BufferedIncrementalDecoder.getstate(self)
     77         # state[1] must be 0 here, as it isn't passed along to the caller
     78         return (state[0], self.first)
     79 
     80     def setstate(self, state):
     81         # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
     82         codecs.BufferedIncrementalDecoder.setstate(self, state)
     83         self.first = state[1]
     84 
     85 class StreamWriter(codecs.StreamWriter):
     86     def reset(self):
     87         codecs.StreamWriter.reset(self)
     88         try:
     89             del self.encode
     90         except AttributeError:
     91             pass
     92 
     93     def encode(self, input, errors='strict'):
     94         self.encode = codecs.utf_8_encode
     95         return encode(input, errors)
     96 
     97 class StreamReader(codecs.StreamReader):
     98     def reset(self):
     99         codecs.StreamReader.reset(self)
    100         try:
    101             del self.decode
    102         except AttributeError:
    103             pass
    104 
    105     def decode(self, input, errors='strict'):
    106         if len(input) < 3:
    107             if codecs.BOM_UTF8.startswith(input):
    108                 # not enough data to decide if this is a BOM
    109                 # => try again on the next call
    110                 return ("", 0)
    111         elif input[:3] == codecs.BOM_UTF8:
    112             self.decode = codecs.utf_8_decode
    113             (output, consumed) = codecs.utf_8_decode(input[3:],errors)
    114             return (output, consumed+3)
    115         # (else) no BOM present
    116         self.decode = codecs.utf_8_decode
    117         return codecs.utf_8_decode(input, errors)
    118 
    119 ### encodings module API
    120 
    121 def getregentry():
    122     return codecs.CodecInfo(
    123         name='utf-8-sig',
    124         encode=encode,
    125         decode=decode,
    126         incrementalencoder=IncrementalEncoder,
    127         incrementaldecoder=IncrementalDecoder,
    128         streamreader=StreamReader,
    129         streamwriter=StreamWriter,
    130     )
    131