1 from __future__ import absolute_import, division, unicode_literals 2 3 import json 4 import warnings 5 import re 6 7 from six import unichr 8 9 from .support import get_data_files 10 11 from html5lib.tokenizer import HTMLTokenizer 12 from html5lib import constants, utils 13 14 15 class TokenizerTestParser(object): 16 def __init__(self, initialState, lastStartTag=None): 17 self.tokenizer = HTMLTokenizer 18 self._state = initialState 19 self._lastStartTag = lastStartTag 20 21 def parse(self, stream, encoding=None, innerHTML=False): 22 tokenizer = self.tokenizer(stream, encoding) 23 self.outputTokens = [] 24 25 tokenizer.state = getattr(tokenizer, self._state) 26 if self._lastStartTag is not None: 27 tokenizer.currentToken = {"type": "startTag", 28 "name": self._lastStartTag} 29 30 types = dict((v, k) for k, v in constants.tokenTypes.items()) 31 for token in tokenizer: 32 getattr(self, 'process%s' % types[token["type"]])(token) 33 34 return self.outputTokens 35 36 def processDoctype(self, token): 37 self.outputTokens.append(["DOCTYPE", token["name"], token["publicId"], 38 token["systemId"], token["correct"]]) 39 40 def processStartTag(self, token): 41 self.outputTokens.append(["StartTag", token["name"], 42 dict(token["data"][::-1]), token["selfClosing"]]) 43 44 def processEmptyTag(self, token): 45 if token["name"] not in constants.voidElements: 46 self.outputTokens.append("ParseError") 47 self.outputTokens.append(["StartTag", token["name"], dict(token["data"][::-1])]) 48 49 def processEndTag(self, token): 50 self.outputTokens.append(["EndTag", token["name"], 51 token["selfClosing"]]) 52 53 def processComment(self, token): 54 self.outputTokens.append(["Comment", token["data"]]) 55 56 def processSpaceCharacters(self, token): 57 self.outputTokens.append(["Character", token["data"]]) 58 self.processSpaceCharacters = self.processCharacters 59 60 def processCharacters(self, token): 61 self.outputTokens.append(["Character", token["data"]]) 62 63 def processEOF(self, token): 64 pass 65 66 def processParseError(self, token): 67 self.outputTokens.append(["ParseError", token["data"]]) 68 69 70 def concatenateCharacterTokens(tokens): 71 outputTokens = [] 72 for token in tokens: 73 if "ParseError" not in token and token[0] == "Character": 74 if (outputTokens and "ParseError" not in outputTokens[-1] and 75 outputTokens[-1][0] == "Character"): 76 outputTokens[-1][1] += token[1] 77 else: 78 outputTokens.append(token) 79 else: 80 outputTokens.append(token) 81 return outputTokens 82 83 84 def normalizeTokens(tokens): 85 # TODO: convert tests to reflect arrays 86 for i, token in enumerate(tokens): 87 if token[0] == 'ParseError': 88 tokens[i] = token[0] 89 return tokens 90 91 92 def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder, 93 ignoreErrors=False): 94 """Test whether the test has passed or failed 95 96 If the ignoreErrorOrder flag is set to true we don't test the relative 97 positions of parse errors and non parse errors 98 """ 99 checkSelfClosing = False 100 for token in expectedTokens: 101 if (token[0] == "StartTag" and len(token) == 4 102 or token[0] == "EndTag" and len(token) == 3): 103 checkSelfClosing = True 104 break 105 106 if not checkSelfClosing: 107 for token in receivedTokens: 108 if token[0] == "StartTag" or token[0] == "EndTag": 109 token.pop() 110 111 if not ignoreErrorOrder and not ignoreErrors: 112 return expectedTokens == receivedTokens 113 else: 114 # Sort the tokens into two groups; non-parse errors and parse errors 115 tokens = {"expected": [[], []], "received": [[], []]} 116 for tokenType, tokenList in zip(list(tokens.keys()), 117 (expectedTokens, receivedTokens)): 118 for token in tokenList: 119 if token != "ParseError": 120 tokens[tokenType][0].append(token) 121 else: 122 if not ignoreErrors: 123 tokens[tokenType][1].append(token) 124 return tokens["expected"] == tokens["received"] 125 126 127 _surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?") 128 129 130 def unescape(test): 131 def decode(inp): 132 """Decode \\uXXXX escapes 133 134 This decodes \\uXXXX escapes, possibly into non-BMP characters when 135 two surrogate character escapes are adjacent to each other. 136 """ 137 # This cannot be implemented using the unicode_escape codec 138 # because that requires its input be ISO-8859-1, and we need 139 # arbitrary unicode as input. 140 def repl(m): 141 if m.group(2) is not None: 142 high = int(m.group(1), 16) 143 low = int(m.group(2), 16) 144 if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF: 145 cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000 146 return unichr(cp) 147 else: 148 return unichr(high) + unichr(low) 149 else: 150 return unichr(int(m.group(1), 16)) 151 try: 152 return _surrogateRe.sub(repl, inp) 153 except ValueError: 154 # This occurs when unichr throws ValueError, which should 155 # only be for a lone-surrogate. 156 if utils.supports_lone_surrogates: 157 raise 158 return None 159 160 test["input"] = decode(test["input"]) 161 for token in test["output"]: 162 if token == "ParseError": 163 continue 164 else: 165 token[1] = decode(token[1]) 166 if len(token) > 2: 167 for key, value in token[2]: 168 del token[2][key] 169 token[2][decode(key)] = decode(value) 170 return test 171 172 173 def runTokenizerTest(test): 174 warnings.resetwarnings() 175 warnings.simplefilter("error") 176 177 expected = concatenateCharacterTokens(test['output']) 178 if 'lastStartTag' not in test: 179 test['lastStartTag'] = None 180 parser = TokenizerTestParser(test['initialState'], 181 test['lastStartTag']) 182 tokens = parser.parse(test['input']) 183 tokens = concatenateCharacterTokens(tokens) 184 received = normalizeTokens(tokens) 185 errorMsg = "\n".join(["\n\nInitial state:", 186 test['initialState'], 187 "\nInput:", test['input'], 188 "\nExpected:", repr(expected), 189 "\nreceived:", repr(tokens)]) 190 errorMsg = errorMsg 191 ignoreErrorOrder = test.get('ignoreErrorOrder', False) 192 assert tokensMatch(expected, received, ignoreErrorOrder, True), errorMsg 193 194 195 def _doCapitalize(match): 196 return match.group(1).upper() 197 198 _capitalizeRe = re.compile(r"\W+(\w)").sub 199 200 201 def capitalize(s): 202 s = s.lower() 203 s = _capitalizeRe(_doCapitalize, s) 204 return s 205 206 207 def testTokenizer(): 208 for filename in get_data_files('tokenizer', '*.test'): 209 with open(filename) as fp: 210 tests = json.load(fp) 211 if 'tests' in tests: 212 for index, test in enumerate(tests['tests']): 213 if 'initialStates' not in test: 214 test["initialStates"] = ["Data state"] 215 if 'doubleEscaped' in test: 216 test = unescape(test) 217 if test["input"] is None: 218 continue # Not valid input for this platform 219 for initialState in test["initialStates"]: 220 test["initialState"] = capitalize(initialState) 221 yield runTokenizerTest, test 222