Home | History | Annotate | Download | only in html5lib
      1 from __future__ import absolute_import, division, unicode_literals
      2 from six import text_type
      3 from six.moves import http_client
      4 
      5 import codecs
      6 import re
      7 
      8 from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
      9 from .constants import encodings, ReparseException
     10 from . import utils
     11 
     12 from io import StringIO
     13 
     14 try:
     15     from io import BytesIO
     16 except ImportError:
     17     BytesIO = StringIO
     18 
     19 try:
     20     from io import BufferedIOBase
     21 except ImportError:
     22     class BufferedIOBase(object):
     23         pass
     24 
     25 # Non-unicode versions of constants for use in the pre-parser
     26 spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
     27 asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
     28 asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
     29 spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
     30 
     31 
     32 invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
     33 
     34 if utils.supports_lone_surrogates:
     35     # Use one extra step of indirection and create surrogates with
     36     # unichr. Not using this indirection would introduce an illegal
     37     # unicode literal on platforms not supporting such lone
     38     # surrogates.
     39     invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
     40                                     eval('"\\uD800-\\uDFFF"'))
     41 else:
     42     invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
     43 
     44 non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
     45                                   0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
     46                                   0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
     47                                   0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
     48                                   0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
     49                                   0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
     50                                   0x10FFFE, 0x10FFFF])
     51 
     52 ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
     53 
     54 # Cache for charsUntil()
     55 charsUntilRegEx = {}
     56 
     57 
     58 class BufferedStream(object):
     59     """Buffering for streams that do not have buffering of their own
     60 
     61     The buffer is implemented as a list of chunks on the assumption that
     62     joining many strings will be slow since it is O(n**2)
     63     """
     64 
     65     def __init__(self, stream):
     66         self.stream = stream
     67         self.buffer = []
     68         self.position = [-1, 0]  # chunk number, offset
     69 
     70     def tell(self):
     71         pos = 0
     72         for chunk in self.buffer[:self.position[0]]:
     73             pos += len(chunk)
     74         pos += self.position[1]
     75         return pos
     76 
     77     def seek(self, pos):
     78         assert pos <= self._bufferedBytes()
     79         offset = pos
     80         i = 0
     81         while len(self.buffer[i]) < offset:
     82             offset -= len(self.buffer[i])
     83             i += 1
     84         self.position = [i, offset]
     85 
     86     def read(self, bytes):
     87         if not self.buffer:
     88             return self._readStream(bytes)
     89         elif (self.position[0] == len(self.buffer) and
     90               self.position[1] == len(self.buffer[-1])):
     91             return self._readStream(bytes)
     92         else:
     93             return self._readFromBuffer(bytes)
     94 
     95     def _bufferedBytes(self):
     96         return sum([len(item) for item in self.buffer])
     97 
     98     def _readStream(self, bytes):
     99         data = self.stream.read(bytes)
    100         self.buffer.append(data)
    101         self.position[0] += 1
    102         self.position[1] = len(data)
    103         return data
    104 
    105     def _readFromBuffer(self, bytes):
    106         remainingBytes = bytes
    107         rv = []
    108         bufferIndex = self.position[0]
    109         bufferOffset = self.position[1]
    110         while bufferIndex < len(self.buffer) and remainingBytes != 0:
    111             assert remainingBytes > 0
    112             bufferedData = self.buffer[bufferIndex]
    113 
    114             if remainingBytes <= len(bufferedData) - bufferOffset:
    115                 bytesToRead = remainingBytes
    116                 self.position = [bufferIndex, bufferOffset + bytesToRead]
    117             else:
    118                 bytesToRead = len(bufferedData) - bufferOffset
    119                 self.position = [bufferIndex, len(bufferedData)]
    120                 bufferIndex += 1
    121             rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
    122             remainingBytes -= bytesToRead
    123 
    124             bufferOffset = 0
    125 
    126         if remainingBytes:
    127             rv.append(self._readStream(remainingBytes))
    128 
    129         return b"".join(rv)
    130 
    131 
    132 def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
    133     if isinstance(source, http_client.HTTPResponse):
    134         # Work around Python bug #20007: read(0) closes the connection.
    135         # http://bugs.python.org/issue20007
    136         isUnicode = False
    137     elif hasattr(source, "read"):
    138         isUnicode = isinstance(source.read(0), text_type)
    139     else:
    140         isUnicode = isinstance(source, text_type)
    141 
    142     if isUnicode:
    143         if encoding is not None:
    144             raise TypeError("Cannot explicitly set an encoding with a unicode string")
    145 
    146         return HTMLUnicodeInputStream(source)
    147     else:
    148         return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
    149 
    150 
    151 class HTMLUnicodeInputStream(object):
    152     """Provides a unicode stream of characters to the HTMLTokenizer.
    153 
    154     This class takes care of character encoding and removing or replacing
    155     incorrect byte-sequences and also provides column and line tracking.
    156 
    157     """
    158 
    159     _defaultChunkSize = 10240
    160 
    161     def __init__(self, source):
    162         """Initialises the HTMLInputStream.
    163 
    164         HTMLInputStream(source, [encoding]) -> Normalized stream from source
    165         for use by html5lib.
    166 
    167         source can be either a file-object, local filename or a string.
    168 
    169         The optional encoding parameter must be a string that indicates
    170         the encoding.  If specified, that encoding will be used,
    171         regardless of any BOM or later declaration (such as in a meta
    172         element)
    173 
    174         parseMeta - Look for a <meta> element containing encoding information
    175 
    176         """
    177 
    178         if not utils.supports_lone_surrogates:
    179             # Such platforms will have already checked for such
    180             # surrogate errors, so no need to do this checking.
    181             self.reportCharacterErrors = None
    182             self.replaceCharactersRegexp = None
    183         elif len("\U0010FFFF") == 1:
    184             self.reportCharacterErrors = self.characterErrorsUCS4
    185             self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
    186         else:
    187             self.reportCharacterErrors = self.characterErrorsUCS2
    188             self.replaceCharactersRegexp = re.compile(
    189                 eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
    190 
    191         # List of where new lines occur
    192         self.newLines = [0]
    193 
    194         self.charEncoding = ("utf-8", "certain")
    195         self.dataStream = self.openStream(source)
    196 
    197         self.reset()
    198 
    199     def reset(self):
    200         self.chunk = ""
    201         self.chunkSize = 0
    202         self.chunkOffset = 0
    203         self.errors = []
    204 
    205         # number of (complete) lines in previous chunks
    206         self.prevNumLines = 0
    207         # number of columns in the last line of the previous chunk
    208         self.prevNumCols = 0
    209 
    210         # Deal with CR LF and surrogates split over chunk boundaries
    211         self._bufferedCharacter = None
    212 
    213     def openStream(self, source):
    214         """Produces a file object from source.
    215 
    216         source can be either a file object, local filename or a string.
    217 
    218         """
    219         # Already a file object
    220         if hasattr(source, 'read'):
    221             stream = source
    222         else:
    223             stream = StringIO(source)
    224 
    225         return stream
    226 
    227     def _position(self, offset):
    228         chunk = self.chunk
    229         nLines = chunk.count('\n', 0, offset)
    230         positionLine = self.prevNumLines + nLines
    231         lastLinePos = chunk.rfind('\n', 0, offset)
    232         if lastLinePos == -1:
    233             positionColumn = self.prevNumCols + offset
    234         else:
    235             positionColumn = offset - (lastLinePos + 1)
    236         return (positionLine, positionColumn)
    237 
    238     def position(self):
    239         """Returns (line, col) of the current position in the stream."""
    240         line, col = self._position(self.chunkOffset)
    241         return (line + 1, col)
    242 
    243     def char(self):
    244         """ Read one character from the stream or queue if available. Return
    245             EOF when EOF is reached.
    246         """
    247         # Read a new chunk from the input stream if necessary
    248         if self.chunkOffset >= self.chunkSize:
    249             if not self.readChunk():
    250                 return EOF
    251 
    252         chunkOffset = self.chunkOffset
    253         char = self.chunk[chunkOffset]
    254         self.chunkOffset = chunkOffset + 1
    255 
    256         return char
    257 
    258     def readChunk(self, chunkSize=None):
    259         if chunkSize is None:
    260             chunkSize = self._defaultChunkSize
    261 
    262         self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
    263 
    264         self.chunk = ""
    265         self.chunkSize = 0
    266         self.chunkOffset = 0
    267 
    268         data = self.dataStream.read(chunkSize)
    269 
    270         # Deal with CR LF and surrogates broken across chunks
    271         if self._bufferedCharacter:
    272             data = self._bufferedCharacter + data
    273             self._bufferedCharacter = None
    274         elif not data:
    275             # We have no more data, bye-bye stream
    276             return False
    277 
    278         if len(data) > 1:
    279             lastv = ord(data[-1])
    280             if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
    281                 self._bufferedCharacter = data[-1]
    282                 data = data[:-1]
    283 
    284         if self.reportCharacterErrors:
    285             self.reportCharacterErrors(data)
    286 
    287             # Replace invalid characters
    288             # Note U+0000 is dealt with in the tokenizer
    289             data = self.replaceCharactersRegexp.sub("\ufffd", data)
    290 
    291         data = data.replace("\r\n", "\n")
    292         data = data.replace("\r", "\n")
    293 
    294         self.chunk = data
    295         self.chunkSize = len(data)
    296 
    297         return True
    298 
    299     def characterErrorsUCS4(self, data):
    300         for i in range(len(invalid_unicode_re.findall(data))):
    301             self.errors.append("invalid-codepoint")
    302 
    303     def characterErrorsUCS2(self, data):
    304         # Someone picked the wrong compile option
    305         # You lose
    306         skip = False
    307         for match in invalid_unicode_re.finditer(data):
    308             if skip:
    309                 continue
    310             codepoint = ord(match.group())
    311             pos = match.start()
    312             # Pretty sure there should be endianness issues here
    313             if utils.isSurrogatePair(data[pos:pos + 2]):
    314                 # We have a surrogate pair!
    315                 char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
    316                 if char_val in non_bmp_invalid_codepoints:
    317                     self.errors.append("invalid-codepoint")
    318                 skip = True
    319             elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
    320                   pos == len(data) - 1):
    321                 self.errors.append("invalid-codepoint")
    322             else:
    323                 skip = False
    324                 self.errors.append("invalid-codepoint")
    325 
    326     def charsUntil(self, characters, opposite=False):
    327         """ Returns a string of characters from the stream up to but not
    328         including any character in 'characters' or EOF. 'characters' must be
    329         a container that supports the 'in' method and iteration over its
    330         characters.
    331         """
    332 
    333         # Use a cache of regexps to find the required characters
    334         try:
    335             chars = charsUntilRegEx[(characters, opposite)]
    336         except KeyError:
    337             if __debug__:
    338                 for c in characters:
    339                     assert(ord(c) < 128)
    340             regex = "".join(["\\x%02x" % ord(c) for c in characters])
    341             if not opposite:
    342                 regex = "^%s" % regex
    343             chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
    344 
    345         rv = []
    346 
    347         while True:
    348             # Find the longest matching prefix
    349             m = chars.match(self.chunk, self.chunkOffset)
    350             if m is None:
    351                 # If nothing matched, and it wasn't because we ran out of chunk,
    352                 # then stop
    353                 if self.chunkOffset != self.chunkSize:
    354                     break
    355             else:
    356                 end = m.end()
    357                 # If not the whole chunk matched, return everything
    358                 # up to the part that didn't match
    359                 if end != self.chunkSize:
    360                     rv.append(self.chunk[self.chunkOffset:end])
    361                     self.chunkOffset = end
    362                     break
    363             # If the whole remainder of the chunk matched,
    364             # use it all and read the next chunk
    365             rv.append(self.chunk[self.chunkOffset:])
    366             if not self.readChunk():
    367                 # Reached EOF
    368                 break
    369 
    370         r = "".join(rv)
    371         return r
    372 
    373     def unget(self, char):
    374         # Only one character is allowed to be ungotten at once - it must
    375         # be consumed again before any further call to unget
    376         if char is not None:
    377             if self.chunkOffset == 0:
    378                 # unget is called quite rarely, so it's a good idea to do
    379                 # more work here if it saves a bit of work in the frequently
    380                 # called char and charsUntil.
    381                 # So, just prepend the ungotten character onto the current
    382                 # chunk:
    383                 self.chunk = char + self.chunk
    384                 self.chunkSize += 1
    385             else:
    386                 self.chunkOffset -= 1
    387                 assert self.chunk[self.chunkOffset] == char
    388 
    389 
    390 class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    391     """Provides a unicode stream of characters to the HTMLTokenizer.
    392 
    393     This class takes care of character encoding and removing or replacing
    394     incorrect byte-sequences and also provides column and line tracking.
    395 
    396     """
    397 
    398     def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
    399         """Initialises the HTMLInputStream.
    400 
    401         HTMLInputStream(source, [encoding]) -> Normalized stream from source
    402         for use by html5lib.
    403 
    404         source can be either a file-object, local filename or a string.
    405 
    406         The optional encoding parameter must be a string that indicates
    407         the encoding.  If specified, that encoding will be used,
    408         regardless of any BOM or later declaration (such as in a meta
    409         element)
    410 
    411         parseMeta - Look for a <meta> element containing encoding information
    412 
    413         """
    414         # Raw Stream - for unicode objects this will encode to utf-8 and set
    415         #              self.charEncoding as appropriate
    416         self.rawStream = self.openStream(source)
    417 
    418         HTMLUnicodeInputStream.__init__(self, self.rawStream)
    419 
    420         self.charEncoding = (codecName(encoding), "certain")
    421 
    422         # Encoding Information
    423         # Number of bytes to use when looking for a meta element with
    424         # encoding information
    425         self.numBytesMeta = 512
    426         # Number of bytes to use when using detecting encoding using chardet
    427         self.numBytesChardet = 100
    428         # Encoding to use if no other information can be found
    429         self.defaultEncoding = "windows-1252"
    430 
    431         # Detect encoding iff no explicit "transport level" encoding is supplied
    432         if (self.charEncoding[0] is None):
    433             self.charEncoding = self.detectEncoding(parseMeta, chardet)
    434 
    435         # Call superclass
    436         self.reset()
    437 
    438     def reset(self):
    439         self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
    440                                                                  'replace')
    441         HTMLUnicodeInputStream.reset(self)
    442 
    443     def openStream(self, source):
    444         """Produces a file object from source.
    445 
    446         source can be either a file object, local filename or a string.
    447 
    448         """
    449         # Already a file object
    450         if hasattr(source, 'read'):
    451             stream = source
    452         else:
    453             stream = BytesIO(source)
    454 
    455         try:
    456             stream.seek(stream.tell())
    457         except:
    458             stream = BufferedStream(stream)
    459 
    460         return stream
    461 
    462     def detectEncoding(self, parseMeta=True, chardet=True):
    463         # First look for a BOM
    464         # This will also read past the BOM if present
    465         encoding = self.detectBOM()
    466         confidence = "certain"
    467         # If there is no BOM need to look for meta elements with encoding
    468         # information
    469         if encoding is None and parseMeta:
    470             encoding = self.detectEncodingMeta()
    471             confidence = "tentative"
    472         # Guess with chardet, if avaliable
    473         if encoding is None and chardet:
    474             confidence = "tentative"
    475             try:
    476                 try:
    477                     from charade.universaldetector import UniversalDetector
    478                 except ImportError:
    479                     from chardet.universaldetector import UniversalDetector
    480                 buffers = []
    481                 detector = UniversalDetector()
    482                 while not detector.done:
    483                     buffer = self.rawStream.read(self.numBytesChardet)
    484                     assert isinstance(buffer, bytes)
    485                     if not buffer:
    486                         break
    487                     buffers.append(buffer)
    488                     detector.feed(buffer)
    489                 detector.close()
    490                 encoding = detector.result['encoding']
    491                 self.rawStream.seek(0)
    492             except ImportError:
    493                 pass
    494         # If all else fails use the default encoding
    495         if encoding is None:
    496             confidence = "tentative"
    497             encoding = self.defaultEncoding
    498 
    499         # Substitute for equivalent encodings:
    500         encodingSub = {"iso-8859-1": "windows-1252"}
    501 
    502         if encoding.lower() in encodingSub:
    503             encoding = encodingSub[encoding.lower()]
    504 
    505         return encoding, confidence
    506 
    507     def changeEncoding(self, newEncoding):
    508         assert self.charEncoding[1] != "certain"
    509         newEncoding = codecName(newEncoding)
    510         if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
    511             newEncoding = "utf-8"
    512         if newEncoding is None:
    513             return
    514         elif newEncoding == self.charEncoding[0]:
    515             self.charEncoding = (self.charEncoding[0], "certain")
    516         else:
    517             self.rawStream.seek(0)
    518             self.reset()
    519             self.charEncoding = (newEncoding, "certain")
    520             raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
    521 
    522     def detectBOM(self):
    523         """Attempts to detect at BOM at the start of the stream. If
    524         an encoding can be determined from the BOM return the name of the
    525         encoding otherwise return None"""
    526         bomDict = {
    527             codecs.BOM_UTF8: 'utf-8',
    528             codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
    529             codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
    530         }
    531 
    532         # Go to beginning of file and read in 4 bytes
    533         string = self.rawStream.read(4)
    534         assert isinstance(string, bytes)
    535 
    536         # Try detecting the BOM using bytes from the string
    537         encoding = bomDict.get(string[:3])         # UTF-8
    538         seek = 3
    539         if not encoding:
    540             # Need to detect UTF-32 before UTF-16
    541             encoding = bomDict.get(string)         # UTF-32
    542             seek = 4
    543             if not encoding:
    544                 encoding = bomDict.get(string[:2])  # UTF-16
    545                 seek = 2
    546 
    547         # Set the read position past the BOM if one was found, otherwise
    548         # set it to the start of the stream
    549         self.rawStream.seek(encoding and seek or 0)
    550 
    551         return encoding
    552 
    553     def detectEncodingMeta(self):
    554         """Report the encoding declared by the meta element
    555         """
    556         buffer = self.rawStream.read(self.numBytesMeta)
    557         assert isinstance(buffer, bytes)
    558         parser = EncodingParser(buffer)
    559         self.rawStream.seek(0)
    560         encoding = parser.getEncoding()
    561 
    562         if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
    563             encoding = "utf-8"
    564 
    565         return encoding
    566 
    567 
    568 class EncodingBytes(bytes):
    569     """String-like object with an associated position and various extra methods
    570     If the position is ever greater than the string length then an exception is
    571     raised"""
    572     def __new__(self, value):
    573         assert isinstance(value, bytes)
    574         return bytes.__new__(self, value.lower())
    575 
    576     def __init__(self, value):
    577         self._position = -1
    578 
    579     def __iter__(self):
    580         return self
    581 
    582     def __next__(self):
    583         p = self._position = self._position + 1
    584         if p >= len(self):
    585             raise StopIteration
    586         elif p < 0:
    587             raise TypeError
    588         return self[p:p + 1]
    589 
    590     def next(self):
    591         # Py2 compat
    592         return self.__next__()
    593 
    594     def previous(self):
    595         p = self._position
    596         if p >= len(self):
    597             raise StopIteration
    598         elif p < 0:
    599             raise TypeError
    600         self._position = p = p - 1
    601         return self[p:p + 1]
    602 
    603     def setPosition(self, position):
    604         if self._position >= len(self):
    605             raise StopIteration
    606         self._position = position
    607 
    608     def getPosition(self):
    609         if self._position >= len(self):
    610             raise StopIteration
    611         if self._position >= 0:
    612             return self._position
    613         else:
    614             return None
    615 
    616     position = property(getPosition, setPosition)
    617 
    618     def getCurrentByte(self):
    619         return self[self.position:self.position + 1]
    620 
    621     currentByte = property(getCurrentByte)
    622 
    623     def skip(self, chars=spaceCharactersBytes):
    624         """Skip past a list of characters"""
    625         p = self.position               # use property for the error-checking
    626         while p < len(self):
    627             c = self[p:p + 1]
    628             if c not in chars:
    629                 self._position = p
    630                 return c
    631             p += 1
    632         self._position = p
    633         return None
    634 
    635     def skipUntil(self, chars):
    636         p = self.position
    637         while p < len(self):
    638             c = self[p:p + 1]
    639             if c in chars:
    640                 self._position = p
    641                 return c
    642             p += 1
    643         self._position = p
    644         return None
    645 
    646     def matchBytes(self, bytes):
    647         """Look for a sequence of bytes at the start of a string. If the bytes
    648         are found return True and advance the position to the byte after the
    649         match. Otherwise return False and leave the position alone"""
    650         p = self.position
    651         data = self[p:p + len(bytes)]
    652         rv = data.startswith(bytes)
    653         if rv:
    654             self.position += len(bytes)
    655         return rv
    656 
    657     def jumpTo(self, bytes):
    658         """Look for the next sequence of bytes matching a given sequence. If
    659         a match is found advance the position to the last byte of the match"""
    660         newPosition = self[self.position:].find(bytes)
    661         if newPosition > -1:
    662             # XXX: This is ugly, but I can't see a nicer way to fix this.
    663             if self._position == -1:
    664                 self._position = 0
    665             self._position += (newPosition + len(bytes) - 1)
    666             return True
    667         else:
    668             raise StopIteration
    669 
    670 
    671 class EncodingParser(object):
    672     """Mini parser for detecting character encoding from meta elements"""
    673 
    def __init__(self, data):
        """data - the bytes to examine for an encoding declaration"""
        self.encoding = None
        self.data = EncodingBytes(data)
    678 
    def getEncoding(self):
        """Scan the data and return the encoding found, or None."""
        # Order matters: longer prefixes come before the shorter ones they
        # start with ("<!--" before "<!", every special form before "<").
        dispatch = (
            (b"<!--", self.handleComment),
            (b"<meta", self.handleMeta),
            (b"</", self.handlePossibleEndTag),
            (b"<!", self.handleOther),
            (b"<?", self.handleOther),
            (b"<", self.handlePossibleStartTag))
        for _ in self.data:
            keepParsing = True
            for prefix, handler in dispatch:
                if not self.data.matchBytes(prefix):
                    continue
                try:
                    keepParsing = handler()
                except StopIteration:
                    # The handler ran off the end of the data
                    keepParsing = False
                # Only the first matching prefix is handled per position
                break
            if not keepParsing:
                break

        return self.encoding
    701 
    def handleComment(self):
        """Skip over comments"""
        # jumpTo() returns True on success and raises StopIteration when the
        # terminator is missing
        terminator = b"-->"
        return self.data.jumpTo(terminator)
    705 
    def handleMeta(self):
        """Inspect the attributes of a <meta> element for an encoding.

        Returns False, with self.encoding set, once an encoding has been
        determined; returns True when scanning should carry on.
        """
        if self.data.currentByte not in spaceCharactersBytes:
            # if we have <meta not followed by a space so just keep going
            return True

        # We have a valid meta element we want to search for attributes
        hasPragma = False
        pendingEncoding = None
        while True:
            # Try to find the next attribute after the current position
            attr = self.getAttribute()
            if attr is None:
                return True

            name = attr[0]
            if name == b"http-equiv":
                hasPragma = attr[1] == b"content-type"
                # A content attribute seen before the pragma now applies
                if hasPragma and pendingEncoding is not None:
                    self.encoding = pendingEncoding
                    return False
            elif name == b"charset":
                codec = codecName(attr[1])
                if codec is not None:
                    self.encoding = codec
                    return False
            elif name == b"content":
                contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                tentativeEncoding = contentParser.parse()
                if tentativeEncoding is None:
                    continue
                codec = codecName(tentativeEncoding)
                if codec is None:
                    continue
                if hasPragma:
                    self.encoding = codec
                    return False
                # Remember it until an http-equiv pragma is seen
                pendingEncoding = codec
    741 
    def handlePossibleStartTag(self):
        """Process what may be a start tag via ``handlePossibleTag``."""
        isEndTag = False
        return self.handlePossibleTag(isEndTag)
    744 
    def handlePossibleEndTag(self):
        """Process what may be an end tag via ``handlePossibleTag``.

        One byte is consumed from the stream before delegating.
        """
        stream = self.data
        next(stream)
        isEndTag = True
        return self.handlePossibleTag(isEndTag)
    748 
    def handlePossibleTag(self, endTag):
        """Step over something that looks like a start or end tag.

        ``endTag`` says whether the candidate is an end tag. Always
        returns True.
        """
        stream = self.data
        if stream.currentByte in asciiLettersBytes:
            stopByte = stream.skipUntil(spacesAngleBrackets)
            if stopByte == b"<":
                # Go back to the first step of the overall "two step"
                # algorithm so that the < byte is reprocessed
                stream.previous()
            else:
                # Consume every attribute of the tag
                while self.getAttribute() is not None:
                    pass
        elif endTag:
            # Not an ascii letter after "</": treat it according to
            # handleOther. A possible start tag is simply ignored.
            stream.previous()
            self.handleOther()
        return True
    771 
    def handleOther(self):
        """Jump to the next ``>`` byte.

        Hands back whatever the stream's ``jumpTo`` reports.
        """
        tagClose = b">"
        return self.data.jumpTo(tagClose)
    774 
    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None

        The pair is ``(name, value)`` with both items built by joining the
        bytes read from ``self.data``. ASCII uppercase bytes are lowercased
        in both the name and the value. The value is ``b""`` when the
        attribute has no value.

        None is returned when ``>`` (or a None byte) is met before an
        attribute starts, or when a None byte is met while reading the
        name or the value.

        NOTE(review): ``next(data)`` presumably raises StopIteration when
        the input is exhausted, and that exception is not handled here --
        confirm against the stream class and the callers.
        """
        data = self.data
        # Step 1 (skip chars)
        # presumably skips whitespace and "/" and yields the first other
        # byte, or None when there is none -- TODO confirm against skip()
        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
        assert c is None or len(c) == 1
        # Step 2
        # End of the tag or nothing left: there is no attribute to report
        if c in (b">", None):
            return None
        # Step 3
        attrName = []
        attrValue = []
        # Step 4 attribute name
        while True:
            # "=" only ends the name once at least one name byte was read;
            # a leading "=" falls through to the final branch and becomes
            # part of the name
            if c == b"=" and attrName:
                break
            elif c in spaceCharactersBytes:
                # Step 6!
                # presumably skips the whitespace after the name -- TODO
                # confirm the default argument of skip()
                c = data.skip()
                break
            elif c in (b"/", b">"):
                # Name ended by the tag itself: attribute without a value
                return b"".join(attrName), b""
            elif c in asciiUppercaseBytes:
                attrName.append(c.lower())
            elif c is None:
                return None
            else:
                attrName.append(c)
            # Step 5
            c = next(data)
        # Step 7
        if c != b"=":
            # No "=" follows the name, so there is no value; step back one
            # byte (presumably so that it is read again later)
            data.previous()
            return b"".join(attrName), b""
        # Step 8
        # Move past the "="
        next(data)
        # Step 9
        c = data.skip()
        # Step 10
        if c in (b"'", b'"'):
            # 10.1
            # Quoted value: read up to the matching quote byte
            quoteChar = c
            while True:
                # 10.2
                c = next(data)
                # 10.3
                if c == quoteChar:
                    # Advance past the closing quote before returning
                    next(data)
                    return b"".join(attrName), b"".join(attrValue)
                # 10.4
                elif c in asciiUppercaseBytes:
                    attrValue.append(c.lower())
                # 10.5
                else:
                    attrValue.append(c)
        elif c == b">":
            # "=" directly followed by the end of the tag: empty value
            return b"".join(attrName), b""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
        elif c is None:
            return None
        else:
            attrValue.append(c)
        # Step 11
        # Unquoted value: read until whitespace, "<" or ">"
        while True:
            c = next(data)
            if c in spacesAngleBrackets:
                return b"".join(attrName), b"".join(attrValue)
            elif c in asciiUppercaseBytes:
                attrValue.append(c.lower())
            elif c is None:
                return None
            else:
                attrValue.append(c)
    850 
    851 
    852 class ContentAttrParser(object):
    853     def __init__(self, data):
    854         assert isinstance(data, bytes)
    855         self.data = data
    856 
    857     def parse(self):
    858         try:
    859             # Check if the attr name is charset
    860             # otherwise return
    861             self.data.jumpTo(b"charset")
    862             self.data.position += 1
    863             self.data.skip()
    864             if not self.data.currentByte == b"=":
    865                 # If there is no = sign keep looking for attrs
    866                 return None
    867             self.data.position += 1
    868             self.data.skip()
    869             # Look for an encoding between matching quote marks
    870             if self.data.currentByte in (b'"', b"'"):
    871                 quoteMark = self.data.currentByte
    872                 self.data.position += 1
    873                 oldPosition = self.data.position
    874                 if self.data.jumpTo(quoteMark):
    875                     return self.data[oldPosition:self.data.position]
    876                 else:
    877                     return None
    878             else:
    879                 # Unquoted value
    880                 oldPosition = self.data.position
    881                 try:
    882                     self.data.skipUntil(spaceCharactersBytes)
    883                     return self.data[oldPosition:self.data.position]
    884                 except StopIteration:
    885                     # Return the whole remaining value
    886                     return self.data[oldPosition:]
    887         except StopIteration:
    888             return None
    889 
    890 
    891 def codecName(encoding):
    892     """Return the python codec name corresponding to an encoding or None if the
    893     string doesn't correspond to a valid encoding."""
    894     if isinstance(encoding, bytes):
    895         try:
    896             encoding = encoding.decode("ascii")
    897         except UnicodeDecodeError:
    898             return None
    899     if encoding:
    900         canonicalName = ascii_punctuation_re.sub("", encoding).lower()
    901         return encodings.get(canonicalName, None)
    902     else:
    903         return None
    904