1 from test.support import (gc_collect, bigmemtest, _2G, 2 cpython_only, captured_stdout) 3 import locale 4 import re 5 import sre_compile 6 import string 7 import unittest 8 import warnings 9 from re import Scanner 10 from weakref import proxy 11 12 # Misc tests from Tim Peters' re.doc 13 14 # WARNING: Don't change details in these tests if you don't know 15 # what you're doing. Some of these tests were carefully modeled to 16 # cover most of the code. 17 18 class S(str): 19 def __getitem__(self, index): 20 return S(super().__getitem__(index)) 21 22 class B(bytes): 23 def __getitem__(self, index): 24 return B(super().__getitem__(index)) 25 26 class ReTests(unittest.TestCase): 27 28 def assertTypedEqual(self, actual, expect, msg=None): 29 self.assertEqual(actual, expect, msg) 30 def recurse(actual, expect): 31 if isinstance(expect, (tuple, list)): 32 for x, y in zip(actual, expect): 33 recurse(x, y) 34 else: 35 self.assertIs(type(actual), type(expect), msg) 36 recurse(actual, expect) 37 38 def checkPatternError(self, pattern, errmsg, pos=None): 39 with self.assertRaises(re.error) as cm: 40 re.compile(pattern) 41 with self.subTest(pattern=pattern): 42 err = cm.exception 43 self.assertEqual(err.msg, errmsg) 44 if pos is not None: 45 self.assertEqual(err.pos, pos) 46 47 def checkTemplateError(self, pattern, repl, string, errmsg, pos=None): 48 with self.assertRaises(re.error) as cm: 49 re.sub(pattern, repl, string) 50 with self.subTest(pattern=pattern, repl=repl): 51 err = cm.exception 52 self.assertEqual(err.msg, errmsg) 53 if pos is not None: 54 self.assertEqual(err.pos, pos) 55 56 def test_keep_buffer(self): 57 # See bug 14212 58 b = bytearray(b'x') 59 it = re.finditer(b'a', b) 60 with self.assertRaises(BufferError): 61 b.extend(b'x'*400) 62 list(it) 63 del it 64 gc_collect() 65 b.extend(b'x'*400) 66 67 def test_weakref(self): 68 s = 'QabbbcR' 69 x = re.compile('ab+c') 70 y = proxy(x) 71 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR')) 72 73 def 
test_search_star_plus(self): 74 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0)) 75 self.assertEqual(re.search('x*', 'axx').span(), (0, 0)) 76 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3)) 77 self.assertEqual(re.search('x+', 'axx').span(), (1, 3)) 78 self.assertIsNone(re.search('x', 'aaa')) 79 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0)) 80 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0)) 81 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3)) 82 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3)) 83 self.assertIsNone(re.match('a+', 'xxx')) 84 85 def bump_num(self, matchobj): 86 int_value = int(matchobj.group(0)) 87 return str(int_value + 1) 88 89 def test_basic_re_sub(self): 90 self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz') 91 self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz') 92 self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz') 93 self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz') 94 self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz') 95 self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz') 96 for y in ("\xe0", "\u0430", "\U0001d49c"): 97 self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz') 98 99 self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x') 100 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'), 101 '9.3 -3 24x100y') 102 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3), 103 '9.3 -3 23x99y') 104 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3), 105 '9.3 -3 23x99y') 106 107 self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n') 108 self.assertEqual(re.sub('.', r"\n", 'x'), '\n') 109 110 s = r"\1\1" 111 self.assertEqual(re.sub('(.)', s, 'x'), 'xx') 112 self.assertEqual(re.sub('(.)', s.replace('\\', r'\\'), 'x'), s) 113 self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s) 114 115 self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx') 116 
self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx') 117 self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx') 118 self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx') 119 120 self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') 121 self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') 122 self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), 123 (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8))) 124 for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ': 125 with self.subTest(c): 126 with self.assertRaises(re.error): 127 self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c) 128 129 self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest') 130 131 def test_bug_449964(self): 132 # fails for group followed by other escape 133 self.assertEqual(re.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'), 134 'xx\bxx\b') 135 136 def test_bug_449000(self): 137 # Test for sub() on escaped characters 138 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'), 139 'abc\ndef\n') 140 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'), 141 'abc\ndef\n') 142 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'), 143 'abc\ndef\n') 144 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'), 145 'abc\ndef\n') 146 147 def test_bug_1661(self): 148 # Verify that flags do not get silently ignored with compiled patterns 149 pattern = re.compile('.') 150 self.assertRaises(ValueError, re.match, pattern, 'A', re.I) 151 self.assertRaises(ValueError, re.search, pattern, 'A', re.I) 152 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I) 153 self.assertRaises(ValueError, re.compile, pattern, re.I) 154 155 def test_bug_3629(self): 156 # A regex that triggered a bug in the sre-code validator 157 re.compile("(?P<quote>)(?(quote))") 158 159 def test_sub_template_numeric_escape(self): 160 # bug 776311 and friends 161 self.assertEqual(re.sub('x', r'\0', 'x'), '\0') 162 self.assertEqual(re.sub('x', r'\000', 
'x'), '\000') 163 self.assertEqual(re.sub('x', r'\001', 'x'), '\001') 164 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8') 165 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9') 166 self.assertEqual(re.sub('x', r'\111', 'x'), '\111') 167 self.assertEqual(re.sub('x', r'\117', 'x'), '\117') 168 self.assertEqual(re.sub('x', r'\377', 'x'), '\377') 169 170 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111') 171 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1') 172 173 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00') 174 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07') 175 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8') 176 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9') 177 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a') 178 179 self.checkTemplateError('x', r'\400', 'x', 180 r'octal escape value \400 outside of ' 181 r'range 0-0o377', 0) 182 self.checkTemplateError('x', r'\777', 'x', 183 r'octal escape value \777 outside of ' 184 r'range 0-0o377', 0) 185 186 self.checkTemplateError('x', r'\1', 'x', 'invalid group reference 1', 1) 187 self.checkTemplateError('x', r'\8', 'x', 'invalid group reference 8', 1) 188 self.checkTemplateError('x', r'\9', 'x', 'invalid group reference 9', 1) 189 self.checkTemplateError('x', r'\11', 'x', 'invalid group reference 11', 1) 190 self.checkTemplateError('x', r'\18', 'x', 'invalid group reference 18', 1) 191 self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference 1', 1) 192 self.checkTemplateError('x', r'\90', 'x', 'invalid group reference 90', 1) 193 self.checkTemplateError('x', r'\99', 'x', 'invalid group reference 99', 1) 194 self.checkTemplateError('x', r'\118', 'x', 'invalid group reference 11', 1) 195 self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference 11', 1) 196 self.checkTemplateError('x', r'\181', 'x', 'invalid group reference 18', 1) 197 self.checkTemplateError('x', r'\800', 'x', 'invalid group reference 80', 1) 198 self.checkTemplateError('x', r'\8', 
'', 'invalid group reference 8', 1) 199 200 # in python2.3 (etc), these loop endlessly in sre_parser.py 201 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x') 202 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'), 203 'xz8') 204 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'), 205 'xza') 206 207 def test_qualified_re_sub(self): 208 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb') 209 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa') 210 self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa') 211 212 def test_bug_114660(self): 213 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), 214 'hello there') 215 216 def test_symbolic_groups(self): 217 re.compile(r'(?P<a>x)(?P=a)(?(a)y)') 218 re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)') 219 re.compile(r'(?P<a1>x)\1(?(1)y)') 220 self.checkPatternError(r'(?P<a>)(?P<a>)', 221 "redefinition of group name 'a' as group 2; " 222 "was group 1") 223 self.checkPatternError(r'(?P<a>(?P=a))', 224 "cannot refer to an open group", 10) 225 self.checkPatternError(r'(?Pxy)', 'unknown extension ?Px') 226 self.checkPatternError(r'(?P<a>)(?P=a', 'missing ), unterminated name', 11) 227 self.checkPatternError(r'(?P=', 'missing group name', 4) 228 self.checkPatternError(r'(?P=)', 'missing group name', 4) 229 self.checkPatternError(r'(?P=1)', "bad character in group name '1'", 4) 230 self.checkPatternError(r'(?P=a)', "unknown group name 'a'") 231 self.checkPatternError(r'(?P=a1)', "unknown group name 'a1'") 232 self.checkPatternError(r'(?P=a.)', "bad character in group name 'a.'", 4) 233 self.checkPatternError(r'(?P<)', 'missing >, unterminated name', 4) 234 self.checkPatternError(r'(?P<a', 'missing >, unterminated name', 4) 235 self.checkPatternError(r'(?P<', 'missing group name', 4) 236 self.checkPatternError(r'(?P<>)', 'missing group name', 4) 237 self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4) 238 self.checkPatternError(r'(?P<a.>)', "bad 
character in group name 'a.'", 4) 239 self.checkPatternError(r'(?(', 'missing group name', 3) 240 self.checkPatternError(r'(?())', 'missing group name', 3) 241 self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3) 242 self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3) 243 self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3) 244 self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3) 245 # New valid/invalid identifiers in Python 3 246 re.compile('(?P<>x)(?P=)(?()y)') 247 re.compile('(?P<>x)(?P=)(?()y)') 248 self.checkPatternError('(?P<>x)', "bad character in group name ''", 4) 249 # Support > 100 groups. 250 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) 251 pat = '(?:%s)(?(200)z|t)' % pat 252 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) 253 254 def test_symbolic_refs(self): 255 self.checkTemplateError('(?P<a>x)', r'\g<a', 'xx', 256 'missing >, unterminated name', 3) 257 self.checkTemplateError('(?P<a>x)', r'\g<', 'xx', 258 'missing group name', 3) 259 self.checkTemplateError('(?P<a>x)', r'\g', 'xx', 'missing <', 2) 260 self.checkTemplateError('(?P<a>x)', r'\g<a a>', 'xx', 261 "bad character in group name 'a a'", 3) 262 self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx', 263 'missing group name', 3) 264 self.checkTemplateError('(?P<a>x)', r'\g<1a1>', 'xx', 265 "bad character in group name '1a1'", 3) 266 self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx', 267 'invalid group reference 2', 3) 268 self.checkTemplateError('(?P<a>x)', r'\2', 'xx', 269 'invalid group reference 2', 1) 270 with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"): 271 re.sub('(?P<a>x)', r'\g<ab>', 'xx') 272 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '') 273 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '') 274 self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx', 275 "bad character in group name '-1'", 3) 276 # New valid/invalid identifiers in Python 3 
277 self.assertEqual(re.sub('(?P<>x)', r'\g<>', 'xx'), 'xx') 278 self.assertEqual(re.sub('(?P<>x)', r'\g<>', 'xx'), 'xx') 279 self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx', 280 "bad character in group name ''", 3) 281 # Support > 100 groups. 282 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) 283 self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8') 284 285 def test_re_subn(self): 286 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) 287 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1)) 288 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0)) 289 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4)) 290 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2)) 291 self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2)) 292 293 def test_re_split(self): 294 for string in ":a:b::c", S(":a:b::c"): 295 self.assertTypedEqual(re.split(":", string), 296 ['', 'a', 'b', '', 'c']) 297 self.assertTypedEqual(re.split(":+", string), 298 ['', 'a', 'b', 'c']) 299 self.assertTypedEqual(re.split("(:+)", string), 300 ['', ':', 'a', ':', 'b', '::', 'c']) 301 for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"), 302 memoryview(b":a:b::c")): 303 self.assertTypedEqual(re.split(b":", string), 304 [b'', b'a', b'b', b'', b'c']) 305 self.assertTypedEqual(re.split(b":+", string), 306 [b'', b'a', b'b', b'c']) 307 self.assertTypedEqual(re.split(b"(:+)", string), 308 [b'', b':', b'a', b':', b'b', b'::', b'c']) 309 for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432", 310 "\U0001d49c\U0001d49e\U0001d4b5"): 311 string = ":%s:%s::%s" % (a, b, c) 312 self.assertEqual(re.split(":", string), ['', a, b, '', c]) 313 self.assertEqual(re.split(":+", string), ['', a, b, c]) 314 self.assertEqual(re.split("(:+)", string), 315 ['', ':', a, ':', b, '::', c]) 316 317 self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c']) 318 self.assertEqual(re.split("(:)+", ":a:b::c"), 319 ['', ':', 'a', ':', 'b', 
':', 'c']) 320 self.assertEqual(re.split("([b:]+)", ":a:b::c"), 321 ['', ':', 'a', ':b::', 'c']) 322 self.assertEqual(re.split("(b)|(:+)", ":a:b::c"), 323 ['', None, ':', 'a', None, ':', '', 'b', None, '', 324 None, '::', 'c']) 325 self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"), 326 ['', 'a', '', '', 'c']) 327 328 for sep, expected in [ 329 (':*', ['', '', 'a', '', 'b', '', 'c', '']), 330 ('(?::*)', ['', '', 'a', '', 'b', '', 'c', '']), 331 ('(:*)', ['', ':', '', '', 'a', ':', '', '', 'b', '::', '', '', 'c', '', '']), 332 ('(:)*', ['', ':', '', None, 'a', ':', '', None, 'b', ':', '', None, 'c', None, '']), 333 ]: 334 with self.subTest(sep=sep): 335 self.assertTypedEqual(re.split(sep, ':a:b::c'), expected) 336 337 for sep, expected in [ 338 ('', ['', ':', 'a', ':', 'b', ':', ':', 'c', '']), 339 (r'\b', [':', 'a', ':', 'b', '::', 'c', '']), 340 (r'(?=:)', ['', ':a', ':b', ':', ':c']), 341 (r'(?<=:)', [':', 'a:', 'b:', ':', 'c']), 342 ]: 343 with self.subTest(sep=sep): 344 self.assertTypedEqual(re.split(sep, ':a:b::c'), expected) 345 346 def test_qualified_re_split(self): 347 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c']) 348 self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c']) 349 self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d']) 350 self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2), 351 ['', ':', 'a', ':', 'b::c']) 352 self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2), 353 ['', ':', 'a', ':', 'b::c']) 354 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2), 355 ['', ':', '', '', 'a:b::c']) 356 357 def test_re_findall(self): 358 self.assertEqual(re.findall(":+", "abc"), []) 359 for string in "a:b::c:::d", S("a:b::c:::d"): 360 self.assertTypedEqual(re.findall(":+", string), 361 [":", "::", ":::"]) 362 self.assertTypedEqual(re.findall("(:+)", string), 363 [":", "::", ":::"]) 364 self.assertTypedEqual(re.findall("(:)(:*)", string), 365 [(":", ""), (":", ":"), (":", "::")]) 366 for 
string in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"), 367 memoryview(b"a:b::c:::d")): 368 self.assertTypedEqual(re.findall(b":+", string), 369 [b":", b"::", b":::"]) 370 self.assertTypedEqual(re.findall(b"(:+)", string), 371 [b":", b"::", b":::"]) 372 self.assertTypedEqual(re.findall(b"(:)(:*)", string), 373 [(b":", b""), (b":", b":"), (b":", b"::")]) 374 for x in ("\xe0", "\u0430", "\U0001d49c"): 375 xx = x * 2 376 xxx = x * 3 377 string = "a%sb%sc%sd" % (x, xx, xxx) 378 self.assertEqual(re.findall("%s+" % x, string), [x, xx, xxx]) 379 self.assertEqual(re.findall("(%s+)" % x, string), [x, xx, xxx]) 380 self.assertEqual(re.findall("(%s)(%s*)" % (x, x), string), 381 [(x, ""), (x, x), (x, xx)]) 382 383 def test_bug_117612(self): 384 self.assertEqual(re.findall(r"(a|(b))", "aba"), 385 [("a", ""),("b", "b"),("a", "")]) 386 387 def test_re_match(self): 388 for string in 'a', S('a'): 389 self.assertEqual(re.match('a', string).groups(), ()) 390 self.assertEqual(re.match('(a)', string).groups(), ('a',)) 391 self.assertEqual(re.match('(a)', string).group(0), 'a') 392 self.assertEqual(re.match('(a)', string).group(1), 'a') 393 self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a')) 394 for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'): 395 self.assertEqual(re.match(b'a', string).groups(), ()) 396 self.assertEqual(re.match(b'(a)', string).groups(), (b'a',)) 397 self.assertEqual(re.match(b'(a)', string).group(0), b'a') 398 self.assertEqual(re.match(b'(a)', string).group(1), b'a') 399 self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a')) 400 for a in ("\xe0", "\u0430", "\U0001d49c"): 401 self.assertEqual(re.match(a, a).groups(), ()) 402 self.assertEqual(re.match('(%s)' % a, a).groups(), (a,)) 403 self.assertEqual(re.match('(%s)' % a, a).group(0), a) 404 self.assertEqual(re.match('(%s)' % a, a).group(1), a) 405 self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a)) 406 407 pat = re.compile('((a)|(b))(c)?') 408 
self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None)) 409 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None)) 410 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c')) 411 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c')) 412 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c')) 413 414 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?') 415 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None)) 416 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'), 417 (None, 'b', None)) 418 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c')) 419 420 def test_group(self): 421 class Index: 422 def __init__(self, value): 423 self.value = value 424 def __index__(self): 425 return self.value 426 # A single group 427 m = re.match('(a)(b)', 'ab') 428 self.assertEqual(m.group(), 'ab') 429 self.assertEqual(m.group(0), 'ab') 430 self.assertEqual(m.group(1), 'a') 431 self.assertEqual(m.group(Index(1)), 'a') 432 self.assertRaises(IndexError, m.group, -1) 433 self.assertRaises(IndexError, m.group, 3) 434 self.assertRaises(IndexError, m.group, 1<<1000) 435 self.assertRaises(IndexError, m.group, Index(1<<1000)) 436 self.assertRaises(IndexError, m.group, 'x') 437 # Multiple groups 438 self.assertEqual(m.group(2, 1), ('b', 'a')) 439 self.assertEqual(m.group(Index(2), Index(1)), ('b', 'a')) 440 441 def test_match_getitem(self): 442 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?') 443 444 m = pat.match('a') 445 self.assertEqual(m['a1'], 'a') 446 self.assertEqual(m['b2'], None) 447 self.assertEqual(m['c3'], None) 448 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=None') 449 self.assertEqual(m[0], 'a') 450 self.assertEqual(m[1], 'a') 451 self.assertEqual(m[2], None) 452 self.assertEqual(m[3], None) 453 with self.assertRaisesRegex(IndexError, 'no such group'): 454 m['X'] 455 with self.assertRaisesRegex(IndexError, 'no such group'): 456 m[-1] 457 with 
self.assertRaisesRegex(IndexError, 'no such group'): 458 m[4] 459 with self.assertRaisesRegex(IndexError, 'no such group'): 460 m[0, 1] 461 with self.assertRaisesRegex(IndexError, 'no such group'): 462 m[(0,)] 463 with self.assertRaisesRegex(IndexError, 'no such group'): 464 m[(0, 1)] 465 with self.assertRaisesRegex(IndexError, 'no such group'): 466 'a1={a2}'.format_map(m) 467 468 m = pat.match('ac') 469 self.assertEqual(m['a1'], 'a') 470 self.assertEqual(m['b2'], None) 471 self.assertEqual(m['c3'], 'c') 472 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=c') 473 self.assertEqual(m[0], 'ac') 474 self.assertEqual(m[1], 'a') 475 self.assertEqual(m[2], None) 476 self.assertEqual(m[3], 'c') 477 478 # Cannot assign. 479 with self.assertRaises(TypeError): 480 m[0] = 1 481 482 # No len(). 483 self.assertRaises(TypeError, len, m) 484 485 def test_re_fullmatch(self): 486 # Issue 16203: Proposal: add re.fullmatch() method. 487 self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1)) 488 for string in "ab", S("ab"): 489 self.assertEqual(re.fullmatch(r"a|ab", string).span(), (0, 2)) 490 for string in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"): 491 self.assertEqual(re.fullmatch(br"a|ab", string).span(), (0, 2)) 492 for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e": 493 r = r"%s|%s" % (a, a + b) 494 self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2)) 495 self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3)) 496 self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3)) 497 self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2)) 498 self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3)) 499 self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4)) 500 self.assertIsNone(re.fullmatch(r"a+", "ab")) 501 self.assertIsNone(re.fullmatch(r"abc$", "abc\n")) 502 self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n")) 503 self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n")) 504 
self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4)) 505 self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4)) 506 self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2)) 507 508 self.assertEqual( 509 re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3)) 510 self.assertEqual( 511 re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3)) 512 self.assertEqual( 513 re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3)) 514 515 def test_re_groupref_exists(self): 516 self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a)').groups(), 517 ('(', 'a')) 518 self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a').groups(), 519 (None, 'a')) 520 self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a)')) 521 self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a')) 522 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(), 523 ('a', 'b')) 524 self.assertEqual(re.match(r'^(?:(a)|c)((?(1)b|d))$', 'cd').groups(), 525 (None, 'd')) 526 self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'cd').groups(), 527 (None, 'd')) 528 self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'a').groups(), 529 ('a', '')) 530 531 # Tests for bug #1177831: exercise groups other than the first group 532 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))') 533 self.assertEqual(p.match('abc').groups(), 534 ('a', 'b', 'c')) 535 self.assertEqual(p.match('ad').groups(), 536 ('a', None, 'd')) 537 self.assertIsNone(p.match('abd')) 538 self.assertIsNone(p.match('ac')) 539 540 # Support > 100 groups. 
541 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) 542 pat = '(?:%s)(?(200)z)' % pat 543 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) 544 545 self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10) 546 self.checkPatternError(r'()(?(1)a|b', 547 'missing ), unterminated subpattern', 2) 548 self.checkPatternError(r'()(?(1)a|b|c)', 549 'conditional backref with more than ' 550 'two branches', 10) 551 552 def test_re_groupref_overflow(self): 553 from sre_constants import MAXGROUPS 554 self.checkTemplateError('()', r'\g<%s>' % MAXGROUPS, 'xx', 555 'invalid group reference %d' % MAXGROUPS, 3) 556 self.checkPatternError(r'(?P<a>)(?(%d))' % MAXGROUPS, 557 'invalid group reference %d' % MAXGROUPS, 10) 558 559 def test_re_groupref(self): 560 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(), 561 ('|', 'a')) 562 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(), 563 (None, 'a')) 564 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|')) 565 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a')) 566 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(), 567 ('a', 'a')) 568 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(), 569 (None, None)) 570 571 self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4) 572 573 def test_groupdict(self): 574 self.assertEqual(re.match('(?P<first>first) (?P<second>second)', 575 'first second').groupdict(), 576 {'first':'first', 'second':'second'}) 577 578 def test_expand(self): 579 self.assertEqual(re.match("(?P<first>first) (?P<second>second)", 580 "first second") 581 .expand(r"\2 \1 \g<second> \g<first>"), 582 "second first second first") 583 self.assertEqual(re.match("(?P<first>first)|(?P<second>second)", 584 "first") 585 .expand(r"\2 \g<second>"), 586 " ") 587 588 def test_repeat_minmax(self): 589 self.assertIsNone(re.match(r"^(\w){1}$", "abc")) 590 self.assertIsNone(re.match(r"^(\w){1}?$", "abc")) 591 
self.assertIsNone(re.match(r"^(\w){1,2}$", "abc")) 592 self.assertIsNone(re.match(r"^(\w){1,2}?$", "abc")) 593 594 self.assertEqual(re.match(r"^(\w){3}$", "abc").group(1), "c") 595 self.assertEqual(re.match(r"^(\w){1,3}$", "abc").group(1), "c") 596 self.assertEqual(re.match(r"^(\w){1,4}$", "abc").group(1), "c") 597 self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c") 598 self.assertEqual(re.match(r"^(\w){3}?$", "abc").group(1), "c") 599 self.assertEqual(re.match(r"^(\w){1,3}?$", "abc").group(1), "c") 600 self.assertEqual(re.match(r"^(\w){1,4}?$", "abc").group(1), "c") 601 self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c") 602 603 self.assertIsNone(re.match(r"^x{1}$", "xxx")) 604 self.assertIsNone(re.match(r"^x{1}?$", "xxx")) 605 self.assertIsNone(re.match(r"^x{1,2}$", "xxx")) 606 self.assertIsNone(re.match(r"^x{1,2}?$", "xxx")) 607 608 self.assertTrue(re.match(r"^x{3}$", "xxx")) 609 self.assertTrue(re.match(r"^x{1,3}$", "xxx")) 610 self.assertTrue(re.match(r"^x{3,3}$", "xxx")) 611 self.assertTrue(re.match(r"^x{1,4}$", "xxx")) 612 self.assertTrue(re.match(r"^x{3,4}?$", "xxx")) 613 self.assertTrue(re.match(r"^x{3}?$", "xxx")) 614 self.assertTrue(re.match(r"^x{1,3}?$", "xxx")) 615 self.assertTrue(re.match(r"^x{1,4}?$", "xxx")) 616 self.assertTrue(re.match(r"^x{3,4}?$", "xxx")) 617 618 self.assertIsNone(re.match(r"^x{}$", "xxx")) 619 self.assertTrue(re.match(r"^x{}$", "x{}")) 620 621 self.checkPatternError(r'x{2,1}', 622 'min repeat greater than max repeat', 2) 623 624 def test_getattr(self): 625 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)") 626 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U) 627 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2) 628 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {}) 629 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex, 630 {'first': 1, 'other': 2}) 631 632 self.assertEqual(re.match("(a)", "a").pos, 0) 633 self.assertEqual(re.match("(a)", 
"a").endpos, 1) 634 self.assertEqual(re.match("(a)", "a").string, "a") 635 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1))) 636 self.assertTrue(re.match("(a)", "a").re) 637 638 # Issue 14260. groupindex should be non-modifiable mapping. 639 p = re.compile(r'(?i)(?P<first>a)(?P<other>b)') 640 self.assertEqual(sorted(p.groupindex), ['first', 'other']) 641 self.assertEqual(p.groupindex['other'], 2) 642 with self.assertRaises(TypeError): 643 p.groupindex['other'] = 0 644 self.assertEqual(p.groupindex['other'], 2) 645 646 def test_special_escapes(self): 647 self.assertEqual(re.search(r"\b(b.)\b", 648 "abcd abc bcd bx").group(1), "bx") 649 self.assertEqual(re.search(r"\B(b.)\B", 650 "abc bcd bc abxd").group(1), "bx") 651 self.assertEqual(re.search(r"\b(b.)\b", 652 "abcd abc bcd bx", re.ASCII).group(1), "bx") 653 self.assertEqual(re.search(r"\B(b.)\B", 654 "abc bcd bc abxd", re.ASCII).group(1), "bx") 655 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc") 656 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc") 657 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M)) 658 self.assertEqual(re.search(br"\b(b.)\b", 659 b"abcd abc bcd bx").group(1), b"bx") 660 self.assertEqual(re.search(br"\B(b.)\B", 661 b"abc bcd bc abxd").group(1), b"bx") 662 self.assertEqual(re.search(br"\b(b.)\b", 663 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx") 664 self.assertEqual(re.search(br"\B(b.)\B", 665 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx") 666 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc") 667 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc") 668 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M)) 669 self.assertEqual(re.search(r"\d\D\w\W\s\S", 670 "1aa! a").group(0), "1aa! a") 671 self.assertEqual(re.search(br"\d\D\w\W\s\S", 672 b"1aa! a").group(0), b"1aa! a") 673 self.assertEqual(re.search(r"\d\D\w\W\s\S", 674 "1aa! a", re.ASCII).group(0), "1aa! 
a") 675 self.assertEqual(re.search(br"\d\D\w\W\s\S", 676 b"1aa! a", re.LOCALE).group(0), b"1aa! a") 677 678 def test_other_escapes(self): 679 self.checkPatternError("\\", 'bad escape (end of pattern)', 0) 680 self.assertEqual(re.match(r"\(", '(').group(), '(') 681 self.assertIsNone(re.match(r"\(", ')')) 682 self.assertEqual(re.match(r"\\", '\\').group(), '\\') 683 self.assertEqual(re.match(r"[\]]", ']').group(), ']') 684 self.assertIsNone(re.match(r"[\]]", '[')) 685 self.assertEqual(re.match(r"[a\-c]", '-').group(), '-') 686 self.assertIsNone(re.match(r"[a\-c]", 'b')) 687 self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^') 688 self.assertIsNone(re.match(r"[\^a]+", 'b')) 689 re.purge() # for warnings 690 for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY': 691 with self.subTest(c): 692 self.assertRaises(re.error, re.compile, '\\%c' % c) 693 for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ': 694 with self.subTest(c): 695 self.assertRaises(re.error, re.compile, '[\\%c]' % c) 696 697 def test_string_boundaries(self): 698 # See http://bugs.python.org/issue10713 699 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), 700 "abc") 701 # There's a word boundary at the start of a string. 702 self.assertTrue(re.match(r"\b", "abc")) 703 # A non-empty string includes a non-boundary zero-length match. 704 self.assertTrue(re.search(r"\B", "abc")) 705 # There is no non-boundary match at the start of a string. 706 self.assertFalse(re.match(r"\B", "abc")) 707 # However, an empty string contains no word boundaries, and also no 708 # non-boundaries. 709 self.assertIsNone(re.search(r"\B", "")) 710 # This one is questionable and different from the perlre behaviour, 711 # but describes current behavior. 712 self.assertIsNone(re.search(r"\b", "")) 713 # A single word-character string has two boundaries, but no 714 # non-boundary gaps. 
715 self.assertEqual(len(re.findall(r"\b", "a")), 2) 716 self.assertEqual(len(re.findall(r"\B", "a")), 0) 717 # If there are no words, there are no boundaries 718 self.assertEqual(len(re.findall(r"\b", " ")), 0) 719 self.assertEqual(len(re.findall(r"\b", " ")), 0) 720 # Can match around the whitespace. 721 self.assertEqual(len(re.findall(r"\B", " ")), 2) 722 723 def test_bigcharset(self): 724 self.assertEqual(re.match("([\u2222\u2223])", 725 "\u2222").group(1), "\u2222") 726 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255))) 727 self.assertEqual(re.match(r, "\uff01").group(), "\uff01") 728 729 def test_big_codesize(self): 730 # Issue #1160 731 r = re.compile('|'.join(('%d'%x for x in range(10000)))) 732 self.assertTrue(r.match('1000')) 733 self.assertTrue(r.match('9999')) 734 735 def test_anyall(self): 736 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0), 737 "a\nb") 738 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0), 739 "a\n\nb") 740 741 def test_lookahead(self): 742 self.assertEqual(re.match(r"(a(?=\s[^a]))", "a b").group(1), "a") 743 self.assertEqual(re.match(r"(a(?=\s[^a]*))", "a b").group(1), "a") 744 self.assertEqual(re.match(r"(a(?=\s[abc]))", "a b").group(1), "a") 745 self.assertEqual(re.match(r"(a(?=\s[abc]*))", "a bc").group(1), "a") 746 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a") 747 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a") 748 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a") 749 750 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a") 751 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a") 752 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a") 753 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a") 754 755 # Group reference. 756 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba')) 757 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac')) 758 # Conditional group reference. 
759 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc')) 760 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc')) 761 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc')) 762 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc')) 763 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc')) 764 # Group used before defined. 765 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc')) 766 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc')) 767 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc')) 768 769 def test_lookbehind(self): 770 self.assertTrue(re.match(r'ab(?<=b)c', 'abc')) 771 self.assertIsNone(re.match(r'ab(?<=c)c', 'abc')) 772 self.assertIsNone(re.match(r'ab(?<!b)c', 'abc')) 773 self.assertTrue(re.match(r'ab(?<!c)c', 'abc')) 774 # Group reference. 775 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac')) 776 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa')) 777 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac')) 778 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa')) 779 # Conditional group reference. 780 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc')) 781 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc')) 782 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc')) 783 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc')) 784 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc')) 785 # Group used before defined. 
786 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)') 787 self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc')) 788 self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc')) 789 # Group defined in the same lookbehind pattern 790 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)') 791 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)') 792 self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)') 793 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)') 794 795 def test_ignore_case(self): 796 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") 797 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC") 798 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b") 799 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb") 800 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b") 801 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb") 802 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a") 803 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa") 804 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a") 805 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa") 806 807 assert '\u212a'.lower() == 'k' # '' 808 self.assertTrue(re.match(r'K', '\u212a', re.I)) 809 self.assertTrue(re.match(r'k', '\u212a', re.I)) 810 self.assertTrue(re.match(r'\u212a', 'K', re.I)) 811 self.assertTrue(re.match(r'\u212a', 'k', re.I)) 812 assert '\u017f'.upper() == 'S' # '' 813 self.assertTrue(re.match(r'S', '\u017f', re.I)) 814 self.assertTrue(re.match(r's', '\u017f', re.I)) 815 self.assertTrue(re.match(r'\u017f', 'S', re.I)) 816 self.assertTrue(re.match(r'\u017f', 's', re.I)) 817 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # '', '' 818 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I)) 819 self.assertTrue(re.match(r'\ufb06', '\ufb05', 
re.I)) 820 821 def test_ignore_case_set(self): 822 self.assertTrue(re.match(r'[19A]', 'A', re.I)) 823 self.assertTrue(re.match(r'[19a]', 'a', re.I)) 824 self.assertTrue(re.match(r'[19a]', 'A', re.I)) 825 self.assertTrue(re.match(r'[19A]', 'a', re.I)) 826 self.assertTrue(re.match(br'[19A]', b'A', re.I)) 827 self.assertTrue(re.match(br'[19a]', b'a', re.I)) 828 self.assertTrue(re.match(br'[19a]', b'A', re.I)) 829 self.assertTrue(re.match(br'[19A]', b'a', re.I)) 830 assert '\u212a'.lower() == 'k' # '' 831 self.assertTrue(re.match(r'[19K]', '\u212a', re.I)) 832 self.assertTrue(re.match(r'[19k]', '\u212a', re.I)) 833 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I)) 834 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I)) 835 assert '\u017f'.upper() == 'S' # '' 836 self.assertTrue(re.match(r'[19S]', '\u017f', re.I)) 837 self.assertTrue(re.match(r'[19s]', '\u017f', re.I)) 838 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I)) 839 self.assertTrue(re.match(r'[19\u017f]', 's', re.I)) 840 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # '', '' 841 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I)) 842 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I)) 843 844 def test_ignore_case_range(self): 845 # Issues #3511, #17381. 
846 self.assertTrue(re.match(r'[9-a]', '_', re.I)) 847 self.assertIsNone(re.match(r'[9-A]', '_', re.I)) 848 self.assertTrue(re.match(br'[9-a]', b'_', re.I)) 849 self.assertIsNone(re.match(br'[9-A]', b'_', re.I)) 850 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I)) 851 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I)) 852 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I)) 853 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I)) 854 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I)) 855 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I)) 856 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I)) 857 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I)) 858 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I)) 859 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I)) 860 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I)) 861 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I)) 862 863 assert '\u212a'.lower() == 'k' # '' 864 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I)) 865 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I)) 866 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I)) 867 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I)) 868 assert '\u017f'.upper() == 'S' # '' 869 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I)) 870 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I)) 871 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I)) 872 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I)) 873 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # '', '' 874 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I)) 875 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I)) 876 877 def test_category(self): 878 self.assertEqual(re.match(r"(\s)", " ").group(1), " ") 879 880 @cpython_only 881 def test_case_helpers(self): 882 import _sre 883 for i in range(128): 884 c = chr(i) 885 lo = 
ord(c.lower()) 886 self.assertEqual(_sre.ascii_tolower(i), lo) 887 self.assertEqual(_sre.unicode_tolower(i), lo) 888 iscased = c in string.ascii_letters 889 self.assertEqual(_sre.ascii_iscased(i), iscased) 890 self.assertEqual(_sre.unicode_iscased(i), iscased) 891 892 for i in list(range(128, 0x1000)) + [0x10400, 0x10428]: 893 c = chr(i) 894 self.assertEqual(_sre.ascii_tolower(i), i) 895 if i != 0x0130: 896 self.assertEqual(_sre.unicode_tolower(i), ord(c.lower())) 897 iscased = c != c.lower() or c != c.upper() 898 self.assertFalse(_sre.ascii_iscased(i)) 899 self.assertEqual(_sre.unicode_iscased(i), 900 c != c.lower() or c != c.upper()) 901 902 self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130) 903 self.assertEqual(_sre.unicode_tolower(0x0130), ord('i')) 904 self.assertFalse(_sre.ascii_iscased(0x0130)) 905 self.assertTrue(_sre.unicode_iscased(0x0130)) 906 907 def test_not_literal(self): 908 self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b") 909 self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb") 910 911 def test_possible_set_operations(self): 912 s = bytes(range(128)).decode() 913 with self.assertWarns(FutureWarning): 914 p = re.compile(r'[0-9--1]') 915 self.assertEqual(p.findall(s), list('-./0123456789')) 916 self.assertEqual(re.findall(r'[--1]', s), list('-./01')) 917 with self.assertWarns(FutureWarning): 918 p = re.compile(r'[%--1]') 919 self.assertEqual(p.findall(s), list("%&'()*+,-1")) 920 with self.assertWarns(FutureWarning): 921 p = re.compile(r'[%--]') 922 self.assertEqual(p.findall(s), list("%&'()*+,-")) 923 924 with self.assertWarns(FutureWarning): 925 p = re.compile(r'[0-9&&1]') 926 self.assertEqual(p.findall(s), list('&0123456789')) 927 with self.assertWarns(FutureWarning): 928 p = re.compile(r'[\d&&1]') 929 self.assertEqual(p.findall(s), list('&0123456789')) 930 self.assertEqual(re.findall(r'[&&1]', s), list('&1')) 931 932 with self.assertWarns(FutureWarning): 933 p = re.compile(r'[0-9||a]') 934 self.assertEqual(p.findall(s), 
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.fullmatch):
    """Assert that *matcher* finds *pattern* in *text* as expected.

    With neither *match* nor *span* given, the pattern must match the
    whole text.  Otherwise both must be given: *match* is the expected
    matched string and *span* the expected (start, end) pair.

    Raises ValueError when exactly one of *match* and *span* is supplied.
    """
    # Exactly one of the two expectations is missing: reject the call.
    if (match is None) != (span is None):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if match is None:
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
test_re_escape_bytes(self): 989 p = bytes(range(256)) 990 for i in p: 991 b = bytes([i]) 992 self.assertMatch(re.escape(b), b) 993 self.assertMatch(b'[' + re.escape(b) + b']', b) 994 self.assertMatch(b'(?x)' + re.escape(b), b) 995 self.assertMatch(re.escape(p), p) 996 for i in b'-.]{}': 997 b = bytes([i]) 998 self.assertEqual(re.escape(b)[:1], b'\\') 999 literal_chars = self.LITERAL_CHARS.encode('ascii') 1000 self.assertEqual(re.escape(literal_chars), literal_chars) 1001 1002 def test_re_escape_non_ascii(self): 1003 s = 'xxx\u2620\u2620\u2620xxx' 1004 s_escaped = re.escape(s) 1005 self.assertEqual(s_escaped, s) 1006 self.assertMatch(s_escaped, s) 1007 self.assertMatch('.%s+.' % re.escape('\u2620'), s, 1008 'x\u2620\u2620\u2620x', (2, 7), re.search) 1009 1010 def test_re_escape_non_ascii_bytes(self): 1011 b = 'y\u2620y\u2620y'.encode('utf-8') 1012 b_escaped = re.escape(b) 1013 self.assertEqual(b_escaped, b) 1014 self.assertMatch(b_escaped, b) 1015 res = re.findall(re.escape('\u2620'.encode('utf-8')), b) 1016 self.assertEqual(len(res), 2) 1017 1018 def test_pickling(self): 1019 import pickle 1020 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE) 1021 for proto in range(pickle.HIGHEST_PROTOCOL + 1): 1022 pickled = pickle.dumps(oldpat, proto) 1023 newpat = pickle.loads(pickled) 1024 self.assertEqual(newpat, oldpat) 1025 # current pickle expects the _compile() reconstructor in re module 1026 from re import _compile 1027 1028 def test_copying(self): 1029 import copy 1030 p = re.compile(r'(?P<int>\d+)(?:\.(?P<frac>\d*))?') 1031 self.assertIs(copy.copy(p), p) 1032 self.assertIs(copy.deepcopy(p), p) 1033 m = p.match('12.34') 1034 self.assertIs(copy.copy(m), m) 1035 self.assertIs(copy.deepcopy(m), m) 1036 1037 def test_constants(self): 1038 self.assertEqual(re.I, re.IGNORECASE) 1039 self.assertEqual(re.L, re.LOCALE) 1040 self.assertEqual(re.M, re.MULTILINE) 1041 self.assertEqual(re.S, re.DOTALL) 1042 self.assertEqual(re.X, re.VERBOSE) 1043 1044 def 
def test_sre_character_literals(self):
    """Octal, \\x, \\u and \\U escapes in str patterns match their character."""
    code_points = [0, 8, 16, 32, 64, 127, 128, 255, 256,
                   0xFFFF, 0x10000, 0x10FFFF]
    for code_point in code_points:
        char = chr(code_point)
        # Collect every escape form able to express this code point,
        # paired with the trailing characters appended to pattern and text.
        forms = []
        if code_point < 256:
            forms.append((r"\%03o" % code_point, ("", "0", "8")))
            forms.append((r"\x%02x" % code_point, ("", "0", "z")))
        if code_point < 0x10000:
            forms.append((r"\u%04x" % code_point, ("", "0", "z")))
        forms.append((r"\U%08x" % code_point, ("", "0", "z")))
        for escape, tails in forms:
            for tail in tails:
                self.assertTrue(re.match(escape + tail, char + tail))

    # Short octal escapes, optionally followed by a non-octal digit.
    short_octal = (
        (r"\0", "\000"),
        (r"\08", "\0008"),
        (r"\01", "\001"),
        (r"\018", "\0018"),
    )
    for pattern, subject in short_octal:
        self.assertTrue(re.match(pattern, subject))

    # (pattern, expected error message, expected error position)
    malformed = (
        (r"\567",
         r'octal escape value \567 outside of '
         r'range 0-0o377', 0),
        (r"\911", 'invalid group reference 91', 1),
        (r"\x1", r'incomplete escape \x1', 0),
        (r"\x1z", r'incomplete escape \x1', 0),
        (r"\u123", r'incomplete escape \u123', 0),
        (r"\u123z", r'incomplete escape \u123', 0),
        (r"\U0001234", r'incomplete escape \U0001234', 0),
        (r"\U0001234z", r'incomplete escape \U0001234', 0),
        (r"\U00110000", r'bad escape \U00110000', 0),
    )
    for pattern, message, position in malformed:
        self.checkPatternError(pattern, message, position)
def test_sre_byte_literals(self):
    """Octal and \\x escapes in bytes patterns match the corresponding byte."""
    for value in [0, 8, 16, 32, 64, 127, 128, 255]:
        raw = bytes([value])
        # Each escape is tried bare, then followed by each trailing
        # character (also appended to the subject).
        for template, tails in ((r"\%03o", "08"), (r"\x%02x", "0z")):
            escape = template % value
            self.assertTrue(re.match(escape.encode(), raw))
            for tail in tails:
                self.assertTrue(re.match((escape + tail).encode(),
                                         raw + tail.encode()))

    # \u and \U escapes are rejected in bytes patterns.
    for unicode_escape in (br"\u1234", br"\U00012345"):
        with self.assertRaises(re.error):
            re.compile(unicode_escape)

    short_octal = (
        (br"\0", b"\000"),
        (br"\08", b"\0008"),
        (br"\01", b"\001"),
        (br"\018", b"\0018"),
    )
    for pattern, subject in short_octal:
        self.assertTrue(re.match(pattern, subject))

    # (pattern, expected error message, expected error position)
    malformed = (
        (br"\567",
         r'octal escape value \567 outside of '
         r'range 0-0o377', 0),
        (br"\911", 'invalid group reference 91', 1),
        (br"\x1", r'incomplete escape \x1', 0),
        (br"\x1z", r'incomplete escape \x1', 0),
    )
    for pattern, message, position in malformed:
        self.checkPatternError(pattern, message, position)
should, instead provoking a TypeError. 1155 self.checkPatternError(r"[a-", 'unterminated character set', 0) 1156 self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1) 1157 self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1) 1158 self.checkPatternError(r"[b-a]", 'bad character range b-a', 1) 1159 1160 def test_bug_113254(self): 1161 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1) 1162 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1) 1163 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1)) 1164 1165 def test_bug_527371(self): 1166 # bug described in patches 527371/672491 1167 self.assertIsNone(re.match(r'(a)?a','a').lastindex) 1168 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1) 1169 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a') 1170 self.assertEqual(re.match(r"(?P<a>a(b))", "ab").lastgroup, 'a') 1171 self.assertEqual(re.match(r"((a))", "a").lastindex, 1) 1172 1173 def test_bug_418626(self): 1174 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code 1175 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of 1176 # pattern '*?' on a long string. 1177 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001) 1178 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0), 1179 20003) 1180 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001) 1181 # non-simple '*?' still used to hit the recursion limit, before the 1182 # non-recursive scheme was implemented. 1183 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001) 1184 1185 def test_bug_612074(self): 1186 pat="["+re.escape("\u2039")+"]" 1187 self.assertEqual(re.compile(pat) and 1, 1) 1188 1189 def test_stack_overflow(self): 1190 # nasty cases that used to overflow the straightforward recursive 1191 # implementation of repeated groups. 
1192 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x') 1193 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x') 1194 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x') 1195 1196 def test_nothing_to_repeat(self): 1197 for reps in '*', '+', '?', '{1,2}': 1198 for mod in '', '?': 1199 self.checkPatternError('%s%s' % (reps, mod), 1200 'nothing to repeat', 0) 1201 self.checkPatternError('(?:%s%s)' % (reps, mod), 1202 'nothing to repeat', 3) 1203 1204 def test_multiple_repeat(self): 1205 for outer_reps in '*', '+', '{1,2}': 1206 for outer_mod in '', '?': 1207 outer_op = outer_reps + outer_mod 1208 for inner_reps in '*', '+', '?', '{1,2}': 1209 for inner_mod in '', '?': 1210 inner_op = inner_reps + inner_mod 1211 self.checkPatternError(r'x%s%s' % (inner_op, outer_op), 1212 'multiple repeat', 1 + len(inner_op)) 1213 1214 def test_unlimited_zero_width_repeat(self): 1215 # Issue #9669 1216 self.assertIsNone(re.match(r'(?:a?)*y', 'z')) 1217 self.assertIsNone(re.match(r'(?:a?)+y', 'z')) 1218 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z')) 1219 self.assertIsNone(re.match(r'(?:a?)*?y', 'z')) 1220 self.assertIsNone(re.match(r'(?:a?)+?y', 'z')) 1221 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z')) 1222 1223 def test_scanner(self): 1224 def s_ident(scanner, token): return token 1225 def s_operator(scanner, token): return "op%s" % token 1226 def s_float(scanner, token): return float(token) 1227 def s_int(scanner, token): return int(token) 1228 1229 scanner = Scanner([ 1230 (r"[a-zA-Z_]\w*", s_ident), 1231 (r"\d+\.\d*", s_float), 1232 (r"\d+", s_int), 1233 (r"=|\+|-|\*|/", s_operator), 1234 (r"\s+", None), 1235 ]) 1236 1237 self.assertTrue(scanner.scanner.scanner("").pattern) 1238 1239 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"), 1240 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5, 1241 'op+', 'bar'], '')) 1242 1243 def test_bug_448951(self): 1244 # bug 448951 (similar to 429357, but with single char match) 1245 # (Also test 
greedy matches.) 1246 for op in '','?','*': 1247 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(), 1248 (None, None)) 1249 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(), 1250 ('a:', 'a')) 1251 1252 def test_bug_725106(self): 1253 # capturing groups in alternatives in repeats 1254 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(), 1255 ('b', 'a')) 1256 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(), 1257 ('c', 'b')) 1258 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(), 1259 ('b', None)) 1260 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(), 1261 ('b', None)) 1262 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(), 1263 ('b', 'a')) 1264 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(), 1265 ('c', 'b')) 1266 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(), 1267 ('b', None)) 1268 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(), 1269 ('b', None)) 1270 1271 def test_bug_725149(self): 1272 # mark_stack_base restoring before restoring marks 1273 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(), 1274 ('a', None)) 1275 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(), 1276 ('a', None, None)) 1277 1278 def test_bug_764548(self): 1279 # bug 764548, re.compile() barfs on str/unicode subclasses 1280 class my_unicode(str): pass 1281 pat = re.compile(my_unicode("abc")) 1282 self.assertIsNone(pat.match("xyz")) 1283 1284 def test_finditer(self): 1285 iter = re.finditer(r":+", "a:b::c:::d") 1286 self.assertEqual([item.group(0) for item in iter], 1287 [":", "::", ":::"]) 1288 1289 pat = re.compile(r":+") 1290 iter = pat.finditer("a:b::c:::d", 1, 10) 1291 self.assertEqual([item.group(0) for item in iter], 1292 [":", "::", ":::"]) 1293 1294 pat = re.compile(r":+") 1295 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10) 1296 self.assertEqual([item.group(0) for item in iter], 1297 [":", "::", ":::"]) 1298 1299 pat = re.compile(r":+") 1300 iter = 
pat.finditer("a:b::c:::d", endpos=10, pos=1) 1301 self.assertEqual([item.group(0) for item in iter], 1302 [":", "::", ":::"]) 1303 1304 pat = re.compile(r":+") 1305 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8) 1306 self.assertEqual([item.group(0) for item in iter], 1307 ["::", "::"]) 1308 1309 def test_bug_926075(self): 1310 self.assertIsNot(re.compile('bug_926075'), 1311 re.compile(b'bug_926075')) 1312 1313 def test_bug_931848(self): 1314 pattern = "[\u002E\u3002\uFF0E\uFF61]" 1315 self.assertEqual(re.compile(pattern).split("a.b.c"), 1316 ['a','b','c']) 1317 1318 def test_bug_581080(self): 1319 iter = re.finditer(r"\s", "a b") 1320 self.assertEqual(next(iter).span(), (1,2)) 1321 self.assertRaises(StopIteration, next, iter) 1322 1323 scanner = re.compile(r"\s").scanner("a b") 1324 self.assertEqual(scanner.search().span(), (1, 2)) 1325 self.assertIsNone(scanner.search()) 1326 1327 def test_bug_817234(self): 1328 iter = re.finditer(r".*", "asdf") 1329 self.assertEqual(next(iter).span(), (0, 4)) 1330 self.assertEqual(next(iter).span(), (4, 4)) 1331 self.assertRaises(StopIteration, next, iter) 1332 1333 def test_bug_6561(self): 1334 # '\d' should match characters in Unicode category 'Nd' 1335 # (Number, Decimal Digit), but not those in 'Nl' (Number, 1336 # Letter) or 'No' (Number, Other). 
1337 decimal_digits = [ 1338 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd' 1339 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd' 1340 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd' 1341 ] 1342 for x in decimal_digits: 1343 self.assertEqual(re.match(r'^\d$', x).group(0), x) 1344 1345 not_decimal_digits = [ 1346 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl' 1347 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl' 1348 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No' 1349 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No' 1350 ] 1351 for x in not_decimal_digits: 1352 self.assertIsNone(re.match(r'^\d$', x)) 1353 1354 def test_empty_array(self): 1355 # SF buf 1647541 1356 import array 1357 for typecode in 'bBuhHiIlLfd': 1358 a = array.array(typecode) 1359 self.assertIsNone(re.compile(b"bla").match(a)) 1360 self.assertEqual(re.compile(b"").match(a).groups(), ()) 1361 1362 def test_inline_flags(self): 1363 # Bug #1700 1364 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below 1365 lower_char = '\u1ea1' # Latin Small Letter A with Dot Below 1366 1367 p = re.compile('.' + upper_char, re.I | re.S) 1368 q = p.match('\n' + lower_char) 1369 self.assertTrue(q) 1370 1371 p = re.compile('.' + lower_char, re.I | re.S) 1372 q = p.match('\n' + upper_char) 1373 self.assertTrue(q) 1374 1375 p = re.compile('(?i).' + upper_char, re.S) 1376 q = p.match('\n' + lower_char) 1377 self.assertTrue(q) 1378 1379 p = re.compile('(?i).' + lower_char, re.S) 1380 q = p.match('\n' + upper_char) 1381 self.assertTrue(q) 1382 1383 p = re.compile('(?is).' + upper_char) 1384 q = p.match('\n' + lower_char) 1385 self.assertTrue(q) 1386 1387 p = re.compile('(?is).' + lower_char) 1388 q = p.match('\n' + upper_char) 1389 self.assertTrue(q) 1390 1391 p = re.compile('(?s)(?i).' + upper_char) 1392 q = p.match('\n' + lower_char) 1393 self.assertTrue(q) 1394 1395 p = re.compile('(?s)(?i).' 
+ lower_char) 1396 q = p.match('\n' + upper_char) 1397 self.assertTrue(q) 1398 1399 self.assertTrue(re.match('(?ix) ' + upper_char, lower_char)) 1400 self.assertTrue(re.match('(?ix) ' + lower_char, upper_char)) 1401 self.assertTrue(re.match(' (?i) ' + upper_char, lower_char, re.X)) 1402 self.assertTrue(re.match('(?x) (?i) ' + upper_char, lower_char)) 1403 self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char, re.X)) 1404 1405 p = upper_char + '(?i)' 1406 with self.assertWarns(DeprecationWarning) as warns: 1407 self.assertTrue(re.match(p, lower_char)) 1408 self.assertEqual( 1409 str(warns.warnings[0].message), 1410 'Flags not at the start of the expression %r' % p 1411 ) 1412 self.assertEqual(warns.warnings[0].filename, __file__) 1413 1414 p = upper_char + '(?i)%s' % ('.?' * 100) 1415 with self.assertWarns(DeprecationWarning) as warns: 1416 self.assertTrue(re.match(p, lower_char)) 1417 self.assertEqual( 1418 str(warns.warnings[0].message), 1419 'Flags not at the start of the expression %r (truncated)' % p[:20] 1420 ) 1421 self.assertEqual(warns.warnings[0].filename, __file__) 1422 1423 # bpo-30605: Compiling a bytes instance regex was throwing a BytesWarning 1424 with warnings.catch_warnings(): 1425 warnings.simplefilter('error', BytesWarning) 1426 p = b'A(?i)' 1427 with self.assertWarns(DeprecationWarning) as warns: 1428 self.assertTrue(re.match(p, b'a')) 1429 self.assertEqual( 1430 str(warns.warnings[0].message), 1431 'Flags not at the start of the expression %r' % p 1432 ) 1433 self.assertEqual(warns.warnings[0].filename, __file__) 1434 1435 with self.assertWarns(DeprecationWarning): 1436 self.assertTrue(re.match('(?s).(?i)' + upper_char, '\n' + lower_char)) 1437 with self.assertWarns(DeprecationWarning): 1438 self.assertTrue(re.match('(?i) ' + upper_char + ' (?x)', lower_char)) 1439 with self.assertWarns(DeprecationWarning): 1440 self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char)) 1441 with self.assertWarns(DeprecationWarning): 1442 
self.assertTrue(re.match('^(?i)' + upper_char, lower_char)) 1443 with self.assertWarns(DeprecationWarning): 1444 self.assertTrue(re.match('$|(?i)' + upper_char, lower_char)) 1445 with self.assertWarns(DeprecationWarning) as warns: 1446 self.assertTrue(re.match('(?:(?i)' + upper_char + ')', lower_char)) 1447 self.assertRegex(str(warns.warnings[0].message), 1448 'Flags not at the start') 1449 self.assertEqual(warns.warnings[0].filename, __file__) 1450 with self.assertWarns(DeprecationWarning) as warns: 1451 self.assertTrue(re.fullmatch('(^)?(?(1)(?i)' + upper_char + ')', 1452 lower_char)) 1453 self.assertRegex(str(warns.warnings[0].message), 1454 'Flags not at the start') 1455 self.assertEqual(warns.warnings[0].filename, __file__) 1456 with self.assertWarns(DeprecationWarning) as warns: 1457 self.assertTrue(re.fullmatch('($)?(?(1)|(?i)' + upper_char + ')', 1458 lower_char)) 1459 self.assertRegex(str(warns.warnings[0].message), 1460 'Flags not at the start') 1461 self.assertEqual(warns.warnings[0].filename, __file__) 1462 1463 1464 def test_dollar_matches_twice(self): 1465 "$ matches the end of string, and just before the terminating \n" 1466 pattern = re.compile('$') 1467 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') 1468 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') 1469 self.assertEqual(pattern.sub('#', '\n'), '#\n#') 1470 1471 pattern = re.compile('$', re.MULTILINE) 1472 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' ) 1473 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') 1474 self.assertEqual(pattern.sub('#', '\n'), '#\n#') 1475 1476 def test_bytes_str_mixing(self): 1477 # Mixing str and bytes is disallowed 1478 pat = re.compile('.') 1479 bpat = re.compile(b'.') 1480 self.assertRaises(TypeError, pat.match, b'b') 1481 self.assertRaises(TypeError, bpat.match, 'b') 1482 self.assertRaises(TypeError, pat.sub, b'b', 'c') 1483 self.assertRaises(TypeError, pat.sub, 'b', b'c') 1484 self.assertRaises(TypeError, pat.sub, b'b', 
b'c') 1485 self.assertRaises(TypeError, bpat.sub, b'b', 'c') 1486 self.assertRaises(TypeError, bpat.sub, 'b', b'c') 1487 self.assertRaises(TypeError, bpat.sub, 'b', 'c') 1488 1489 def test_ascii_and_unicode_flag(self): 1490 # String patterns 1491 for flags in (0, re.UNICODE): 1492 pat = re.compile('\xc0', flags | re.IGNORECASE) 1493 self.assertTrue(pat.match('\xe0')) 1494 pat = re.compile(r'\w', flags) 1495 self.assertTrue(pat.match('\xe0')) 1496 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE) 1497 self.assertIsNone(pat.match('\xe0')) 1498 pat = re.compile('(?a)\xc0', re.IGNORECASE) 1499 self.assertIsNone(pat.match('\xe0')) 1500 pat = re.compile(r'\w', re.ASCII) 1501 self.assertIsNone(pat.match('\xe0')) 1502 pat = re.compile(r'(?a)\w') 1503 self.assertIsNone(pat.match('\xe0')) 1504 # Bytes patterns 1505 for flags in (0, re.ASCII): 1506 pat = re.compile(b'\xc0', flags | re.IGNORECASE) 1507 self.assertIsNone(pat.match(b'\xe0')) 1508 pat = re.compile(br'\w', flags) 1509 self.assertIsNone(pat.match(b'\xe0')) 1510 # Incompatibilities 1511 self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE) 1512 self.assertRaises(re.error, re.compile, br'(?u)\w') 1513 self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII) 1514 self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII) 1515 self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE) 1516 self.assertRaises(re.error, re.compile, r'(?au)\w') 1517 1518 def test_locale_flag(self): 1519 # On Windows, Python 3.7 doesn't call setlocale(LC_CTYPE, "") at 1520 # startup and so the LC_CTYPE locale uses Latin1 encoding by default, 1521 # whereas getpreferredencoding() returns the ANSI code page. Set 1522 # temporarily the LC_CTYPE locale to the user preferred encoding to 1523 # ensure that it uses the ANSI code page. 
1524 oldloc = locale.setlocale(locale.LC_CTYPE, None) 1525 locale.setlocale(locale.LC_CTYPE, "") 1526 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldloc) 1527 1528 # Get the current locale encoding 1529 enc = locale.getpreferredencoding(False) 1530 1531 # Search non-ASCII letter 1532 for i in range(128, 256): 1533 try: 1534 c = bytes([i]).decode(enc) 1535 sletter = c.lower() 1536 if sletter == c: continue 1537 bletter = sletter.encode(enc) 1538 if len(bletter) != 1: continue 1539 if bletter.decode(enc) != sletter: continue 1540 bpat = re.escape(bytes([i])) 1541 break 1542 except (UnicodeError, TypeError): 1543 pass 1544 else: 1545 bletter = None 1546 bpat = b'A' 1547 # Bytes patterns 1548 pat = re.compile(bpat, re.LOCALE | re.IGNORECASE) 1549 if bletter: 1550 self.assertTrue(pat.match(bletter)) 1551 pat = re.compile(b'(?L)' + bpat, re.IGNORECASE) 1552 if bletter: 1553 self.assertTrue(pat.match(bletter)) 1554 pat = re.compile(bpat, re.IGNORECASE) 1555 if bletter: 1556 self.assertIsNone(pat.match(bletter)) 1557 pat = re.compile(br'\w', re.LOCALE) 1558 if bletter: 1559 self.assertTrue(pat.match(bletter)) 1560 pat = re.compile(br'(?L)\w') 1561 if bletter: 1562 self.assertTrue(pat.match(bletter)) 1563 pat = re.compile(br'\w') 1564 if bletter: 1565 self.assertIsNone(pat.match(bletter)) 1566 # Incompatibilities 1567 self.assertRaises(ValueError, re.compile, '', re.LOCALE) 1568 self.assertRaises(re.error, re.compile, '(?L)') 1569 self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII) 1570 self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII) 1571 self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE) 1572 self.assertRaises(re.error, re.compile, b'(?aL)') 1573 1574 def test_scoped_flags(self): 1575 self.assertTrue(re.match(r'(?i:a)b', 'Ab')) 1576 self.assertIsNone(re.match(r'(?i:a)b', 'aB')) 1577 self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE)) 1578 self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE)) 1579 
self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab')) 1580 self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB')) 1581 1582 self.assertTrue(re.match(r'(?x: a) b', 'a b')) 1583 self.assertIsNone(re.match(r'(?x: a) b', ' a b')) 1584 self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE)) 1585 self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE)) 1586 1587 self.assertTrue(re.match(r'\w(?a:\W)\w', '\xe0\xe0\xe0')) 1588 self.assertTrue(re.match(r'(?a:\W(?u:\w)\W)', '\xe0\xe0\xe0')) 1589 self.assertTrue(re.match(r'\W(?u:\w)\W', '\xe0\xe0\xe0', re.ASCII)) 1590 1591 self.checkPatternError(r'(?a)(?-a:\w)', 1592 "bad inline flags: cannot turn off flags 'a', 'u' and 'L'", 8) 1593 self.checkPatternError(r'(?i-i:a)', 1594 'bad inline flags: flag turned on and off', 5) 1595 self.checkPatternError(r'(?au:a)', 1596 "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4) 1597 self.checkPatternError(br'(?aL:a)', 1598 "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4) 1599 1600 self.checkPatternError(r'(?-', 'missing flag', 3) 1601 self.checkPatternError(r'(?-+', 'missing flag', 3) 1602 self.checkPatternError(r'(?-z', 'unknown flag', 3) 1603 self.checkPatternError(r'(?-i', 'missing :', 4) 1604 self.checkPatternError(r'(?-i)', 'missing :', 4) 1605 self.checkPatternError(r'(?-i+', 'missing :', 4) 1606 self.checkPatternError(r'(?-iz', 'unknown flag', 4) 1607 self.checkPatternError(r'(?i:', 'missing ), unterminated subpattern', 0) 1608 self.checkPatternError(r'(?i', 'missing -, : or )', 3) 1609 self.checkPatternError(r'(?i+', 'missing -, : or )', 3) 1610 self.checkPatternError(r'(?iz', 'unknown flag', 3) 1611 1612 def test_bug_6509(self): 1613 # Replacement strings of both types must parse properly. 
1614 # all strings 1615 pat = re.compile(r'a(\w)') 1616 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc') 1617 pat = re.compile('a(.)') 1618 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234') 1619 pat = re.compile('..') 1620 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str') 1621 1622 # all bytes 1623 pat = re.compile(br'a(\w)') 1624 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc') 1625 pat = re.compile(b'a(.)') 1626 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD') 1627 pat = re.compile(b'..') 1628 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes') 1629 1630 def test_dealloc(self): 1631 # issue 3299: check for segfault in debug build 1632 import _sre 1633 # the overflow limit is different on wide and narrow builds and it 1634 # depends on the definition of SRE_CODE (see sre.h). 1635 # 2**128 should be big enough to overflow on both. For smaller values 1636 # a RuntimeError is raised instead of OverflowError. 1637 long_overflow = 2**128 1638 self.assertRaises(TypeError, re.finditer, "a", {}) 1639 with self.assertRaises(OverflowError): 1640 _sre.compile("abc", 0, [long_overflow], 0, {}, ()) 1641 with self.assertRaises(TypeError): 1642 _sre.compile({}, 0, [], 0, [], []) 1643 1644 def test_search_dot_unicode(self): 1645 self.assertTrue(re.search("123.*-", '123abc-')) 1646 self.assertTrue(re.search("123.*-", '123\xe9-')) 1647 self.assertTrue(re.search("123.*-", '123\u20ac-')) 1648 self.assertTrue(re.search("123.*-", '123\U0010ffff-')) 1649 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-')) 1650 1651 def test_compile(self): 1652 # Test return value when given string and pattern as parameter 1653 pattern = re.compile('random pattern') 1654 self.assertIsInstance(pattern, re.Pattern) 1655 same_pattern = re.compile(pattern) 1656 self.assertIsInstance(same_pattern, re.Pattern) 1657 self.assertIs(same_pattern, pattern) 1658 # Test behaviour when not given a string or pattern as parameter 1659 self.assertRaises(TypeError, re.compile, 0) 
1660 1661 @bigmemtest(size=_2G, memuse=1) 1662 def test_large_search(self, size): 1663 # Issue #10182: indices were 32-bit-truncated. 1664 s = 'a' * size 1665 m = re.search('$', s) 1666 self.assertIsNotNone(m) 1667 self.assertEqual(m.start(), size) 1668 self.assertEqual(m.end(), size) 1669 1670 # The huge memuse is because of re.sub() using a list and a join() 1671 # to create the replacement result. 1672 @bigmemtest(size=_2G, memuse=16 + 2) 1673 def test_large_subn(self, size): 1674 # Issue #10182: indices were 32-bit-truncated. 1675 s = 'a' * size 1676 r, n = re.subn('', '', s) 1677 self.assertEqual(r, s) 1678 self.assertEqual(n, size + 1) 1679 1680 def test_bug_16688(self): 1681 # Issue 16688: Backreferences make case-insensitive regex fail on 1682 # non-ASCII strings. 1683 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a']) 1684 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2)) 1685 1686 def test_repeat_minmax_overflow(self): 1687 # Issue #13169 1688 string = "x" * 100000 1689 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535)) 1690 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535)) 1691 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535)) 1692 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536)) 1693 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536)) 1694 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536)) 1695 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t. 1696 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128) 1697 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128) 1698 self.assertRaises(OverflowError, re.compile, r".{%d,}?" 
% 2**128) 1699 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128)) 1700 1701 @cpython_only 1702 def test_repeat_minmax_overflow_maxrepeat(self): 1703 try: 1704 from _sre import MAXREPEAT 1705 except ImportError: 1706 self.skipTest('requires _sre.MAXREPEAT constant') 1707 string = "x" * 100000 1708 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string)) 1709 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(), 1710 (0, 100000)) 1711 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string)) 1712 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT) 1713 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT) 1714 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT) 1715 1716 def test_backref_group_name_in_exception(self): 1717 # Issue 17341: Poor error message when compiling invalid regex 1718 self.checkPatternError('(?P=<foo>)', 1719 "bad character in group name '<foo>'", 4) 1720 1721 def test_group_name_in_exception(self): 1722 # Issue 17341: Poor error message when compiling invalid regex 1723 self.checkPatternError('(?P<?foo>)', 1724 "bad character in group name '?foo'", 4) 1725 1726 def test_issue17998(self): 1727 for reps in '*', '+', '?', '{1}': 1728 for mod in '', '?': 1729 pattern = '.' 
+ reps + mod + 'yz' 1730 self.assertEqual(re.compile(pattern, re.S).findall('xyz'), 1731 ['xyz'], msg=pattern) 1732 pattern = pattern.encode() 1733 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'), 1734 [b'xyz'], msg=pattern) 1735 1736 def test_match_repr(self): 1737 for string in '[abracadabra]', S('[abracadabra]'): 1738 m = re.search(r'(.+)(.*?)\1', string) 1739 self.assertEqual(repr(m), "<%s.%s object; " 1740 "span=(1, 12), match='abracadabra'>" % 1741 (type(m).__module__, type(m).__qualname__)) 1742 for string in (b'[abracadabra]', B(b'[abracadabra]'), 1743 bytearray(b'[abracadabra]'), 1744 memoryview(b'[abracadabra]')): 1745 m = re.search(br'(.+)(.*?)\1', string) 1746 self.assertEqual(repr(m), "<%s.%s object; " 1747 "span=(1, 12), match=b'abracadabra'>" % 1748 (type(m).__module__, type(m).__qualname__)) 1749 1750 first, second = list(re.finditer("(aa)|(bb)", "aa bb")) 1751 self.assertEqual(repr(first), "<%s.%s object; " 1752 "span=(0, 2), match='aa'>" % 1753 (type(second).__module__, type(first).__qualname__)) 1754 self.assertEqual(repr(second), "<%s.%s object; " 1755 "span=(3, 5), match='bb'>" % 1756 (type(second).__module__, type(second).__qualname__)) 1757 1758 def test_zerowidth(self): 1759 # Issues 852532, 1647489, 3262, 25054. 
1760 self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', '']) 1761 self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', '', 'bc', '']) 1762 self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', '', 'bc']) 1763 self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', '']) 1764 1765 self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-') 1766 self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a---bc-') 1767 self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::][]bc[]') 1768 1769 self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', '']) 1770 self.assertEqual(re.findall(r"\b|\w+", "a::bc"), 1771 ['', 'a', '', '', 'bc', '']) 1772 1773 self.assertEqual([m.span() for m in re.finditer(r"\b|:+", "a::bc")], 1774 [(0, 0), (1, 1), (1, 3), (3, 3), (5, 5)]) 1775 self.assertEqual([m.span() for m in re.finditer(r"\b|\w+", "a::bc")], 1776 [(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)]) 1777 1778 def test_bug_2537(self): 1779 # issue 2537: empty submatches 1780 for outer_op in ('{0,}', '*', '+', '{1,187}'): 1781 for inner_op in ('{0,}', '*', '?'): 1782 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op)) 1783 m = r.match("xyyzy") 1784 self.assertEqual(m.group(0), "xyy") 1785 self.assertEqual(m.group(1), "") 1786 self.assertEqual(m.group(2), "y") 1787 1788 @cpython_only 1789 def test_debug_flag(self): 1790 pat = r'(\.)(?:[ch]|py)(?(1)$|: )' 1791 with captured_stdout() as out: 1792 re.compile(pat, re.DEBUG) 1793 self.maxDiff = None 1794 dump = '''\ 1795 SUBPATTERN 1 0 0 1796 LITERAL 46 1797 BRANCH 1798 IN 1799 LITERAL 99 1800 LITERAL 104 1801 OR 1802 LITERAL 112 1803 LITERAL 121 1804 GROUPREF_EXISTS 1 1805 AT AT_END 1806 ELSE 1807 LITERAL 58 1808 LITERAL 32 1809 1810 0. INFO 8 0b1 2 5 (to 9) 1811 prefix_skip 0 1812 prefix [0x2e] ('.') 1813 overlap [0] 1814 9: MARK 0 1815 11. LITERAL 0x2e ('.') 1816 13. MARK 1 1817 15. BRANCH 10 (to 26) 1818 17. IN 6 (to 24) 1819 19. LITERAL 0x63 ('c') 1820 21. 
LITERAL 0x68 ('h') 1821 23. FAILURE 1822 24: JUMP 9 (to 34) 1823 26: branch 7 (to 33) 1824 27. LITERAL 0x70 ('p') 1825 29. LITERAL 0x79 ('y') 1826 31. JUMP 2 (to 34) 1827 33: FAILURE 1828 34: GROUPREF_EXISTS 0 6 (to 41) 1829 37. AT END 1830 39. JUMP 5 (to 45) 1831 41: LITERAL 0x3a (':') 1832 43. LITERAL 0x20 (' ') 1833 45: SUCCESS 1834 ''' 1835 self.assertEqual(out.getvalue(), dump) 1836 # Debug output is output again even a second time (bypassing 1837 # the cache -- issue #20426). 1838 with captured_stdout() as out: 1839 re.compile(pat, re.DEBUG) 1840 self.assertEqual(out.getvalue(), dump) 1841 1842 def test_keyword_parameters(self): 1843 # Issue #20283: Accepting the string keyword parameter. 1844 pat = re.compile(r'(ab)') 1845 self.assertEqual( 1846 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9)) 1847 self.assertEqual( 1848 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9)) 1849 self.assertEqual( 1850 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9)) 1851 self.assertEqual( 1852 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab']) 1853 self.assertEqual( 1854 pat.split(string='abracadabra', maxsplit=1), 1855 ['', 'ab', 'racadabra']) 1856 self.assertEqual( 1857 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(), 1858 (7, 9)) 1859 1860 def test_bug_20998(self): 1861 # Issue #20998: Fullmatch of repeated single character pattern 1862 # with ignore case. 
1863 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3)) 1864 1865 def test_locale_caching(self): 1866 # Issue #22410 1867 oldlocale = locale.setlocale(locale.LC_CTYPE) 1868 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale) 1869 for loc in 'en_US.iso88591', 'en_US.utf8': 1870 try: 1871 locale.setlocale(locale.LC_CTYPE, loc) 1872 except locale.Error: 1873 # Unsupported locale on this system 1874 self.skipTest('test needs %s locale' % loc) 1875 1876 re.purge() 1877 self.check_en_US_iso88591() 1878 self.check_en_US_utf8() 1879 re.purge() 1880 self.check_en_US_utf8() 1881 self.check_en_US_iso88591() 1882 1883 def check_en_US_iso88591(self): 1884 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591') 1885 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I)) 1886 self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I)) 1887 self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I)) 1888 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5')) 1889 self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5')) 1890 self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5')) 1891 1892 def check_en_US_utf8(self): 1893 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8') 1894 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I)) 1895 self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I)) 1896 self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I)) 1897 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5')) 1898 self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5')) 1899 self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5')) 1900 1901 def test_locale_compiled(self): 1902 oldlocale = locale.setlocale(locale.LC_CTYPE) 1903 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale) 1904 for loc in 'en_US.iso88591', 'en_US.utf8': 1905 try: 1906 locale.setlocale(locale.LC_CTYPE, loc) 1907 except locale.Error: 1908 # Unsupported locale on this system 1909 self.skipTest('test needs %s locale' % loc) 1910 1911 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591') 1912 p1 = 
re.compile(b'\xc5\xe5', re.L|re.I) 1913 p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I) 1914 p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I) 1915 p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I) 1916 for p in p1, p2, p3: 1917 self.assertTrue(p.match(b'\xc5\xe5')) 1918 self.assertTrue(p.match(b'\xe5\xe5')) 1919 self.assertTrue(p.match(b'\xc5\xc5')) 1920 self.assertIsNone(p4.match(b'\xe5\xc5')) 1921 self.assertIsNone(p4.match(b'\xe5\xe5')) 1922 self.assertIsNone(p4.match(b'\xc5\xc5')) 1923 1924 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8') 1925 for p in p1, p2, p3: 1926 self.assertTrue(p.match(b'\xc5\xe5')) 1927 self.assertIsNone(p.match(b'\xe5\xe5')) 1928 self.assertIsNone(p.match(b'\xc5\xc5')) 1929 self.assertTrue(p4.match(b'\xe5\xc5')) 1930 self.assertIsNone(p4.match(b'\xe5\xe5')) 1931 self.assertIsNone(p4.match(b'\xc5\xc5')) 1932 1933 def test_error(self): 1934 with self.assertRaises(re.error) as cm: 1935 re.compile('(\u20ac))') 1936 err = cm.exception 1937 self.assertIsInstance(err.pattern, str) 1938 self.assertEqual(err.pattern, '(\u20ac))') 1939 self.assertEqual(err.pos, 3) 1940 self.assertEqual(err.lineno, 1) 1941 self.assertEqual(err.colno, 4) 1942 self.assertIn(err.msg, str(err)) 1943 self.assertIn(' at position 3', str(err)) 1944 self.assertNotIn(' at position 3', err.msg) 1945 # Bytes pattern 1946 with self.assertRaises(re.error) as cm: 1947 re.compile(b'(\xa4))') 1948 err = cm.exception 1949 self.assertIsInstance(err.pattern, bytes) 1950 self.assertEqual(err.pattern, b'(\xa4))') 1951 self.assertEqual(err.pos, 3) 1952 # Multiline pattern 1953 with self.assertRaises(re.error) as cm: 1954 re.compile(""" 1955 ( 1956 abc 1957 ) 1958 ) 1959 ( 1960 """, re.VERBOSE) 1961 err = cm.exception 1962 self.assertEqual(err.pos, 77) 1963 self.assertEqual(err.lineno, 5) 1964 self.assertEqual(err.colno, 17) 1965 self.assertIn(err.msg, str(err)) 1966 self.assertIn(' at position 77', str(err)) 1967 self.assertIn('(line 5, column 17)', str(err)) 1968 1969 def 
test_misc_errors(self): 1970 self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0) 1971 self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0) 1972 self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5) 1973 self.checkPatternError(r'(?P', 'unexpected end of pattern', 3) 1974 self.checkPatternError(r'(?z)', 'unknown extension ?z', 1) 1975 self.checkPatternError(r'(?iz)', 'unknown flag', 3) 1976 self.checkPatternError(r'(?i', 'missing -, : or )', 3) 1977 self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0) 1978 self.checkPatternError(r'(?<', 'unexpected end of pattern', 3) 1979 self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1) 1980 self.checkPatternError(r'(?', 'unexpected end of pattern', 2) 1981 1982 def test_enum(self): 1983 # Issue #28082: Check that str(flag) returns a human readable string 1984 # instead of an integer 1985 self.assertIn('ASCII', str(re.A)) 1986 self.assertIn('DOTALL', str(re.S)) 1987 1988 def test_pattern_compare(self): 1989 pattern1 = re.compile('abc', re.IGNORECASE) 1990 1991 # equal to itself 1992 self.assertEqual(pattern1, pattern1) 1993 self.assertFalse(pattern1 != pattern1) 1994 1995 # equal 1996 re.purge() 1997 pattern2 = re.compile('abc', re.IGNORECASE) 1998 self.assertEqual(hash(pattern2), hash(pattern1)) 1999 self.assertEqual(pattern2, pattern1) 2000 2001 # not equal: different pattern 2002 re.purge() 2003 pattern3 = re.compile('XYZ', re.IGNORECASE) 2004 # Don't test hash(pattern3) != hash(pattern1) because there is no 2005 # warranty that hash values are different 2006 self.assertNotEqual(pattern3, pattern1) 2007 2008 # not equal: different flag (flags=0) 2009 re.purge() 2010 pattern4 = re.compile('abc') 2011 self.assertNotEqual(pattern4, pattern1) 2012 2013 # only == and != comparison operators are supported 2014 with self.assertRaises(TypeError): 2015 pattern1 < pattern2 2016 2017 def test_pattern_compare_bytes(self): 2018 pattern1 = re.compile(b'abc') 2019 
2020 # equal: test bytes patterns 2021 re.purge() 2022 pattern2 = re.compile(b'abc') 2023 self.assertEqual(hash(pattern2), hash(pattern1)) 2024 self.assertEqual(pattern2, pattern1) 2025 2026 # not equal: pattern of a different types (str vs bytes), 2027 # comparison must not raise a BytesWarning 2028 re.purge() 2029 pattern3 = re.compile('abc') 2030 with warnings.catch_warnings(): 2031 warnings.simplefilter('error', BytesWarning) 2032 self.assertNotEqual(pattern3, pattern1) 2033 2034 def test_bug_29444(self): 2035 s = bytearray(b'abcdefgh') 2036 m = re.search(b'[a-h]+', s) 2037 m2 = re.search(b'[e-h]+', s) 2038 self.assertEqual(m.group(), b'abcdefgh') 2039 self.assertEqual(m2.group(), b'efgh') 2040 s[:] = b'xyz' 2041 self.assertEqual(m.group(), b'xyz') 2042 self.assertEqual(m2.group(), b'') 2043 2044 def test_bug_34294(self): 2045 # Issue 34294: wrong capturing groups 2046 2047 # exists since Python 2 2048 s = "a\tx" 2049 p = r"\b(?=(\t)|(x))x" 2050 self.assertEqual(re.search(p, s).groups(), (None, 'x')) 2051 2052 # introduced in Python 3.7.0 2053 s = "ab" 2054 p = r"(?=(.)(.)?)" 2055 self.assertEqual(re.findall(p, s), 2056 [('a', 'b'), ('b', '')]) 2057 self.assertEqual([m.groups() for m in re.finditer(p, s)], 2058 [('a', 'b'), ('b', None)]) 2059 2060 # test-cases provided by issue34294, introduced in Python 3.7.0 2061 p = r"(?=<(?P<tag>\w+)/?>(?:(?P<text>.+?)</(?P=tag)>)?)" 2062 s = "<test><foo2/></test>" 2063 self.assertEqual(re.findall(p, s), 2064 [('test', '<foo2/>'), ('foo2', '')]) 2065 self.assertEqual([m.groupdict() for m in re.finditer(p, s)], 2066 [{'tag': 'test', 'text': '<foo2/>'}, 2067 {'tag': 'foo2', 'text': None}]) 2068 s = "<test>Hello</test><foo/>" 2069 self.assertEqual([m.groupdict() for m in re.finditer(p, s)], 2070 [{'tag': 'test', 'text': 'Hello'}, 2071 {'tag': 'foo', 'text': None}]) 2072 s = "<test>Hello</test><foo/><foo/>" 2073 self.assertEqual([m.groupdict() for m in re.finditer(p, s)], 2074 [{'tag': 'test', 'text': 'Hello'}, 2075 {'tag': 
'foo', 'text': None}, 2076 {'tag': 'foo', 'text': None}]) 2077 2078 2079 class PatternReprTests(unittest.TestCase): 2080 def check(self, pattern, expected): 2081 self.assertEqual(repr(re.compile(pattern)), expected) 2082 2083 def check_flags(self, pattern, flags, expected): 2084 self.assertEqual(repr(re.compile(pattern, flags)), expected) 2085 2086 def test_without_flags(self): 2087 self.check('random pattern', 2088 "re.compile('random pattern')") 2089 2090 def test_single_flag(self): 2091 self.check_flags('random pattern', re.IGNORECASE, 2092 "re.compile('random pattern', re.IGNORECASE)") 2093 2094 def test_multiple_flags(self): 2095 self.check_flags('random pattern', re.I|re.S|re.X, 2096 "re.compile('random pattern', " 2097 "re.IGNORECASE|re.DOTALL|re.VERBOSE)") 2098 2099 def test_unicode_flag(self): 2100 self.check_flags('random pattern', re.U, 2101 "re.compile('random pattern')") 2102 self.check_flags('random pattern', re.I|re.S|re.U, 2103 "re.compile('random pattern', " 2104 "re.IGNORECASE|re.DOTALL)") 2105 2106 def test_inline_flags(self): 2107 self.check('(?i)pattern', 2108 "re.compile('(?i)pattern', re.IGNORECASE)") 2109 2110 def test_unknown_flags(self): 2111 self.check_flags('random pattern', 0x123000, 2112 "re.compile('random pattern', 0x123000)") 2113 self.check_flags('random pattern', 0x123000|re.I, 2114 "re.compile('random pattern', re.IGNORECASE|0x123000)") 2115 2116 def test_bytes(self): 2117 self.check(b'bytes pattern', 2118 "re.compile(b'bytes pattern')") 2119 self.check_flags(b'bytes pattern', re.A, 2120 "re.compile(b'bytes pattern', re.ASCII)") 2121 2122 def test_locale(self): 2123 self.check_flags(b'bytes pattern', re.L, 2124 "re.compile(b'bytes pattern', re.LOCALE)") 2125 2126 def test_quotes(self): 2127 self.check('random "double quoted" pattern', 2128 '''re.compile('random "double quoted" pattern')''') 2129 self.check("random 'single quoted' pattern", 2130 '''re.compile("random 'single quoted' pattern")''') 2131 self.check('''both 'single' 
and "double" quotes''', 2132 '''re.compile('both \\'single\\' and "double" quotes')''') 2133 2134 def test_long_pattern(self): 2135 pattern = 'Very %spattern' % ('long ' * 1000) 2136 r = repr(re.compile(pattern)) 2137 self.assertLess(len(r), 300) 2138 self.assertEqual(r[:30], "re.compile('Very long long lon") 2139 r = repr(re.compile(pattern, re.I)) 2140 self.assertLess(len(r), 300) 2141 self.assertEqual(r[:30], "re.compile('Very long long lon") 2142 self.assertEqual(r[-16:], ", re.IGNORECASE)") 2143 2144 2145 class ImplementationTest(unittest.TestCase): 2146 """ 2147 Test implementation details of the re module. 2148 """ 2149 2150 def test_overlap_table(self): 2151 f = sre_compile._generate_overlap_table 2152 self.assertEqual(f(""), []) 2153 self.assertEqual(f("a"), [0]) 2154 self.assertEqual(f("abcd"), [0, 0, 0, 0]) 2155 self.assertEqual(f("aaaa"), [0, 1, 2, 3]) 2156 self.assertEqual(f("ababba"), [0, 0, 1, 2, 0, 1]) 2157 self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0]) 2158 2159 2160 class ExternalTests(unittest.TestCase): 2161 2162 def test_re_benchmarks(self): 2163 're_tests benchmarks' 2164 from test.re_tests import benchmarks 2165 for pattern, s in benchmarks: 2166 with self.subTest(pattern=pattern, string=s): 2167 p = re.compile(pattern) 2168 self.assertTrue(p.search(s)) 2169 self.assertTrue(p.match(s)) 2170 self.assertTrue(p.fullmatch(s)) 2171 s2 = ' '*10000 + s + ' '*10000 2172 self.assertTrue(p.search(s2)) 2173 self.assertTrue(p.match(s2, 10000)) 2174 self.assertTrue(p.match(s2, 10000, 10000 + len(s))) 2175 self.assertTrue(p.fullmatch(s2, 10000, 10000 + len(s))) 2176 2177 def test_re_tests(self): 2178 're_tests test suite' 2179 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR 2180 for t in tests: 2181 pattern = s = outcome = repl = expected = None 2182 if len(t) == 5: 2183 pattern, s, outcome, repl, expected = t 2184 elif len(t) == 3: 2185 pattern, s, outcome = t 2186 else: 2187 raise ValueError('Test tuples should have 3 or 5 
fields', t) 2188 2189 with self.subTest(pattern=pattern, string=s): 2190 if outcome == SYNTAX_ERROR: # Expected a syntax error 2191 with self.assertRaises(re.error): 2192 re.compile(pattern) 2193 continue 2194 2195 obj = re.compile(pattern) 2196 result = obj.search(s) 2197 if outcome == FAIL: 2198 self.assertIsNone(result, 'Succeeded incorrectly') 2199 continue 2200 2201 with self.subTest(): 2202 self.assertTrue(result, 'Failed incorrectly') 2203 # Matched, as expected, so now we compute the 2204 # result string and compare it to our expected result. 2205 start, end = result.span(0) 2206 vardict = {'found': result.group(0), 2207 'groups': result.group(), 2208 'flags': result.re.flags} 2209 for i in range(1, 100): 2210 try: 2211 gi = result.group(i) 2212 # Special hack because else the string concat fails: 2213 if gi is None: 2214 gi = "None" 2215 except IndexError: 2216 gi = "Error" 2217 vardict['g%d' % i] = gi 2218 for i in result.re.groupindex.keys(): 2219 try: 2220 gi = result.group(i) 2221 if gi is None: 2222 gi = "None" 2223 except IndexError: 2224 gi = "Error" 2225 vardict[i] = gi 2226 self.assertEqual(eval(repl, vardict), expected, 2227 'grouping error') 2228 2229 # Try the match with both pattern and string converted to 2230 # bytes, and check that it still succeeds. 2231 try: 2232 bpat = bytes(pattern, "ascii") 2233 bs = bytes(s, "ascii") 2234 except UnicodeEncodeError: 2235 # skip non-ascii tests 2236 pass 2237 else: 2238 with self.subTest('bytes pattern match'): 2239 obj = re.compile(bpat) 2240 self.assertTrue(obj.search(bs)) 2241 2242 # Try the match with LOCALE enabled, and check that it 2243 # still succeeds. 2244 with self.subTest('locale-sensitive match'): 2245 obj = re.compile(bpat, re.LOCALE) 2246 result = obj.search(bs) 2247 if result is None: 2248 print('=== Fails on locale-sensitive match', t) 2249 2250 # Try the match with the search area limited to the extent 2251 # of the match and see if it still succeeds. 
\B will 2252 # break (because it won't match at the end or start of a 2253 # string), so we'll ignore patterns that feature it. 2254 if (pattern[:2] != r'\B' and pattern[-2:] != r'\B' 2255 and result is not None): 2256 with self.subTest('range-limited match'): 2257 obj = re.compile(pattern) 2258 self.assertTrue(obj.search(s, start, end + 1)) 2259 2260 # Try the match with IGNORECASE enabled, and check that it 2261 # still succeeds. 2262 with self.subTest('case-insensitive match'): 2263 obj = re.compile(pattern, re.IGNORECASE) 2264 self.assertTrue(obj.search(s)) 2265 2266 # Try the match with UNICODE locale enabled, and check 2267 # that it still succeeds. 2268 with self.subTest('unicode-sensitive match'): 2269 obj = re.compile(pattern, re.UNICODE) 2270 self.assertTrue(obj.search(s)) 2271 2272 2273 if __name__ == "__main__": 2274 unittest.main() 2275