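"""Tokenizer tests driven by the JSON *.test files found via
get_data_files('tokenizer', '*.test')."""
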
from __future__ import absolute_import, division, unicode_literals

import json
import warnings
import re

from six import unichr

from .support import get_data_files

from html5lib.tokenizer import HTMLTokenizer
from html5lib import constants, utils


class TokenizerTestParser(object):
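    """Run an HTMLTokenizer from a given initial state over an input stream
    and record the emitted tokens in the simplified list form used by the
    test data."""
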
    def __init__(self, initialState, lastStartTag=None):
        self.tokenizer = HTMLTokenizer
        self._state = initialState
        self._lastStartTag = lastStartTag

    def parse(self, stream, encoding=None, innerHTML=False):
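        """Tokenize ``stream`` starting from the configured initial state and
        return the collected output tokens.

        If a last start tag was supplied, the tokenizer's currentToken is
        seeded with it, presumably so that states which compare against the
        "appropriate end tag" (e.g. RCDATA/RAWTEXT handling) behave as if
        that start tag had just been emitted.
        """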
        tokenizer = self.tokenizer(stream, encoding)
        self.outputTokens = []

        tokenizer.state = getattr(tokenizer, self._state)
        if self._lastStartTag is not None:
            tokenizer.currentToken = {"type": "startTag",
                                      "name": self._lastStartTag}

        types = dict((v, k) for k, v in constants.tokenTypes.items())
        for token in tokenizer:
            getattr(self, 'process%s' % types[token["type"]])(token)

        return self.outputTokens

    def processDoctype(self, token):
        self.outputTokens.append(["DOCTYPE", token["name"], token["publicId"],
                                  token["systemId"], token["correct"]])

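    # token["data"] is the attribute list of (name, value) pairs; reversing it
    # before building the dict means the first occurrence of a duplicated
    # attribute wins, which matches the dict form used in the expected output.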
    def processStartTag(self, token):
        self.outputTokens.append(["StartTag", token["name"],
                                  dict(token["data"][::-1]), token["selfClosing"]])

    def processEmptyTag(self, token):
        if token["name"] not in constants.voidElements:
            self.outputTokens.append("ParseError")
        self.outputTokens.append(["StartTag", token["name"], dict(token["data"][::-1])])

    def processEndTag(self, token):
        self.outputTokens.append(["EndTag", token["name"],
                                  token["selfClosing"]])

    def processComment(self, token):
        self.outputTokens.append(["Comment", token["data"]])

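    # Space characters are recorded like any other characters; after the first
    # call the handler rebinds itself to processCharacters, which produces
    # identical output.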
    def processSpaceCharacters(self, token):
        self.outputTokens.append(["Character", token["data"]])
        self.processSpaceCharacters = self.processCharacters

    def processCharacters(self, token):
        self.outputTokens.append(["Character", token["data"]])

    def processEOF(self, token):
        pass

    def processParseError(self, token):
        self.outputTokens.append(["ParseError", token["data"]])


def concatenateCharacterTokens(tokens):
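    """Merge runs of adjacent Character tokens into a single token so that
    comparisons do not depend on how the tokenizer split the character
    data."""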
    outputTokens = []
    for token in tokens:
        if "ParseError" not in token and token[0] == "Character":
            if (outputTokens and "ParseError" not in outputTokens[-1] and
                    outputTokens[-1][0] == "Character"):
                outputTokens[-1][1] += token[1]
            else:
                outputTokens.append(token)
        else:
            outputTokens.append(token)
    return outputTokens


def normalizeTokens(tokens):
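    """Replace received ["ParseError", data] tokens with the bare string
    "ParseError", the form used in the expected test output."""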
    # TODO: convert tests to reflect arrays
    for i, token in enumerate(tokens):
        if token[0] == 'ParseError':
            tokens[i] = token[0]
    return tokens


def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
                ignoreErrors=False):
    """Test whether the received tokens match the expected tokens.

    If ignoreErrorOrder is true, the relative ordering of parse errors and
    non-parse-error tokens is not checked; if ignoreErrors is true, parse
    errors are ignored altogether.
    """
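    # The selfClosing flag is only compared when the expected tokens carry it;
    # otherwise it is stripped from the received StartTag/EndTag tokens below.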
    checkSelfClosing = False
    for token in expectedTokens:
        if ((token[0] == "StartTag" and len(token) == 4) or
                (token[0] == "EndTag" and len(token) == 3)):
            checkSelfClosing = True
            break

    if not checkSelfClosing:
        for token in receivedTokens:
            if token[0] == "StartTag" or token[0] == "EndTag":
                token.pop()

    if not ignoreErrorOrder and not ignoreErrors:
        return expectedTokens == receivedTokens
    else:
        # Sort the tokens into two groups: non-parse errors and parse errors
        tokens = {"expected": [[], []], "received": [[], []]}
        for tokenType, tokenList in zip(list(tokens.keys()),
                                        (expectedTokens, receivedTokens)):
            for token in tokenList:
                if token != "ParseError":
                    tokens[tokenType][0].append(token)
                else:
                    if not ignoreErrors:
                        tokens[tokenType][1].append(token)
        return tokens["expected"] == tokens["received"]


_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?")


def unescape(test):
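    """Decode the extra layer of \\uXXXX escapes in a test flagged
    "doubleEscaped".

    Both the input and the expected output tokens are decoded.  When the
    decoded text requires lone surrogates that this platform cannot
    represent, the input is left as None so the caller can skip the test.
    """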
    def decode(inp):
        """Decode \\uXXXX escapes

        This decodes \\uXXXX escapes, possibly into non-BMP characters when
        two surrogate character escapes are adjacent to each other.
        """
        # This cannot be implemented using the unicode_escape codec
        # because that requires its input be ISO-8859-1, and we need
        # arbitrary unicode as input.
        def repl(m):
            if m.group(2) is not None:
                high = int(m.group(1), 16)
                low = int(m.group(2), 16)
                if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF:
                    cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
                    return unichr(cp)
                else:
                    return unichr(high) + unichr(low)
            else:
                return unichr(int(m.group(1), 16))
        try:
            return _surrogateRe.sub(repl, inp)
        except ValueError:
            # This occurs when unichr throws ValueError, which should
            # only be for a lone-surrogate.
            if utils.supports_lone_surrogates:
                raise
            return None

    test["input"] = decode(test["input"])
    for token in test["output"]:
        if token == "ParseError":
            continue
        else:
            token[1] = decode(token[1])
            if len(token) > 2:
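                # Iterate over a snapshot of the attribute items; the dict is
                # mutated inside the loop.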
                for key, value in list(token[2].items()):
                    del token[2][key]
                    token[2][decode(key)] = decode(value)
    return test


def runTokenizerTest(test):
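    """Run a single tokenizer test, asserting that the concatenated and
    normalized received tokens match the expected tokens.

    Warnings are turned into errors so that any warning raised during
    tokenization fails the test.
    """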
    warnings.resetwarnings()
    warnings.simplefilter("error")

    expected = concatenateCharacterTokens(test['output'])
    if 'lastStartTag' not in test:
        test['lastStartTag'] = None
    parser = TokenizerTestParser(test['initialState'],
                                 test['lastStartTag'])
    tokens = parser.parse(test['input'])
    tokens = concatenateCharacterTokens(tokens)
    received = normalizeTokens(tokens)
    errorMsg = "\n".join(["\n\nInitial state:",
                          test['initialState'],
                          "\nInput:", test['input'],
                          "\nExpected:", repr(expected),
                          "\nReceived:", repr(received)])
    ignoreErrorOrder = test.get('ignoreErrorOrder', False)
    assert tokensMatch(expected, received, ignoreErrorOrder, True), errorMsg


def _doCapitalize(match):
    return match.group(1).upper()

_capitalizeRe = re.compile(r"\W+(\w)").sub


def capitalize(s):
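    """Map a state name from the test data to the attribute name looked up on
    the tokenizer, e.g. "Data state" -> "dataState"."""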
    s = s.lower()
    s = _capitalizeRe(_doCapitalize, s)
    return s


def testTokenizer():
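    """Generator-style test: yield (runTokenizerTest, test) pairs, one per
    test and requested initial state, for the test runner to execute as
    individual test cases."""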
    for filename in get_data_files('tokenizer', '*.test'):
        with open(filename) as fp:
            tests = json.load(fp)
            if 'tests' in tests:
                for index, test in enumerate(tests['tests']):
                    if 'initialStates' not in test:
                        test["initialStates"] = ["Data state"]
                    if 'doubleEscaped' in test:
                        test = unescape(test)
                        if test["input"] is None:
                            continue  # Not valid input for this platform
                    for initialState in test["initialStates"]:
                        test["initialState"] = capitalize(initialState)
                        yield runTokenizerTest, test