"""Python 'utf-8-sig' Codec.

This works similarly to UTF-8, with the following changes:

* On encoding/writing, a UTF-8 encoded BOM will be prepended/written as the
  first three bytes.

* On decoding/reading, if the first three bytes are a UTF-8 encoded BOM, these
  bytes will be skipped.
"""
import codecs

### Codec APIs


def encode(input, errors='strict'):
    """Encode *input* as UTF-8 and prepend the UTF-8 BOM.

    Returns a ``(encoded, length)`` tuple where *length* is ``len(input)``.
    """
    return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0],
            len(input))


def decode(input, errors='strict'):
    """Decode *input* as UTF-8, skipping one leading UTF-8 BOM if present.

    Returns a ``(output, consumed)`` tuple; *consumed* includes the three BOM
    bytes when a BOM was skipped.
    """
    prefix = 0
    if input[:3] == codecs.BOM_UTF8:
        input = input[3:]
        prefix = 3
    # The stateless decoder always treats its input as complete (final=True).
    (output, consumed) = codecs.utf_8_decode(input, errors, True)
    return (output, consumed + prefix)


class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that emits the BOM in front of the first chunk."""

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        # Truthy until the BOM has been written.
        self.first = 1

    def encode(self, input, final=False):
        """Encode *input*; the first call after construction/reset adds the BOM."""
        if self.first:
            self.first = 0
            return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0]
        else:
            return codecs.utf_8_encode(input, self.errors)[0]

    def reset(self):
        """Return to the initial state: the next chunk gets a BOM again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the "BOM still pending" flag as the encoder state."""
        return self.first

    def setstate(self, state):
        """Restore a state previously returned by getstate()."""
        self.first = state


class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental decoder that drops a BOM at the very start of the data."""

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        # True until the start of the data has been checked for a BOM,
        # None afterwards.
        self.first = True

    def _buffer_decode(self, input, errors, final):
        """Decode *input*, deciding once whether it starts with a BOM.

        Returns ``(output, consumed)``.  While fewer than three bytes are
        available and they could still be the beginning of a BOM, nothing is
        consumed so the decision is retried with more data.
        """
        if self.first:
            if len(input) < 3:
                if codecs.BOM_UTF8.startswith(input):
                    # not enough data to decide if this really is a BOM
                    # => try again on the next call
                    # NOTE(review): this branch does not look at `final`, so
                    # data ending inside a BOM prefix produces no output and
                    # no error here -- confirm that this is intended.
                    return (u"", 0)
                else:
                    self.first = None
            else:
                self.first = None
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = codecs.utf_8_decode(input[3:], errors,
                                                             final)
                    return (output, consumed + 3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Return to the initial state: a leading BOM will be skipped again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = True

    def getstate(self):
        """Return ``(buffered_bytes, flag)``.

        *flag* is 1 while the BOM check is still pending and 0 once it has
        been made, so that setstate() can restore it together with the buffer.
        """
        buffered = codecs.BufferedIncrementalDecoder.getstate(self)[0]
        return (buffered, 1 if self.first else 0)

    def setstate(self, state):
        """Restore a state previously returned by getstate().

        The base class only restores the byte buffer; the BOM-pending flag
        is taken from the second item of *state*.
        """
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        self.first = True if state[1] else None


class StreamWriter(codecs.StreamWriter):
    """Stream writer that emits the BOM in front of the first write."""

    def reset(self):
        """Reset the writer so that the next encode() call adds a BOM again."""
        codecs.StreamWriter.reset(self)
        # Remove the instance-level override installed by encode(), if any,
        # so that the class-level (BOM-writing) encode() is used again.
        try:
            del self.encode
        except AttributeError:
            pass

    def encode(self, input, errors='strict'):
        """Encode with a BOM, then switch this instance to plain UTF-8."""
        # Shadow this method on the instance: later calls skip the BOM.
        self.encode = codecs.utf_8_encode
        return encode(input, errors)


class StreamReader(codecs.StreamReader):
    """Stream reader that drops a BOM at the very start of the data."""

    def reset(self):
        """Reset the reader so that the next decode() call checks for a BOM."""
        codecs.StreamReader.reset(self)
        # Remove the instance-level override installed by decode(), if any,
        # so that the class-level (BOM-checking) decode() is used again.
        try:
            del self.decode
        except AttributeError:
            pass

    def decode(self, input, errors='strict'):
        """Decode *input*, skipping a leading BOM; returns ``(output, consumed)``.

        Once the BOM question is settled, this instance is switched to the
        plain UTF-8 decoder for all later calls.
        """
        if len(input) < 3:
            if codecs.BOM_UTF8.startswith(input):
                # not enough data to decide if this is a BOM
                # => try again on the next call
                return (u"", 0)
        elif input[:3] == codecs.BOM_UTF8:
            self.decode = codecs.utf_8_decode
            (output, consumed) = codecs.utf_8_decode(input[3:], errors)
            return (output, consumed + 3)
        # (else) no BOM present
        self.decode = codecs.utf_8_decode
        return codecs.utf_8_decode(input, errors)


### encodings module API


def getregentry():
    """Return the CodecInfo record describing the 'utf-8-sig' codec."""
    return codecs.CodecInfo(
        name='utf-8-sig',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )