Home | History | Annotate | Download | only in test
      1 from test.support import (gc_collect, bigmemtest, _2G,
      2                           cpython_only, captured_stdout)
      3 import locale
      4 import re
      5 import sre_compile
      6 import string
      7 import unittest
      8 import warnings
      9 from re import Scanner
     10 from weakref import proxy
     11 
     12 # Misc tests from Tim Peters' re.doc
     13 
     14 # WARNING: Don't change details in these tests if you don't know
     15 # what you're doing. Some of these tests were carefully modeled to
     16 # cover most of the code.
     17 
     18 class S(str):
     19     def __getitem__(self, index):
     20         return S(super().__getitem__(index))
     21 
     22 class B(bytes):
     23     def __getitem__(self, index):
     24         return B(super().__getitem__(index))
     25 
     26 class ReTests(unittest.TestCase):
     27 
     28     def assertTypedEqual(self, actual, expect, msg=None):
     29         self.assertEqual(actual, expect, msg)
     30         def recurse(actual, expect):
     31             if isinstance(expect, (tuple, list)):
     32                 for x, y in zip(actual, expect):
     33                     recurse(x, y)
     34             else:
     35                 self.assertIs(type(actual), type(expect), msg)
     36         recurse(actual, expect)
     37 
     38     def checkPatternError(self, pattern, errmsg, pos=None):
     39         with self.assertRaises(re.error) as cm:
     40             re.compile(pattern)
     41         with self.subTest(pattern=pattern):
     42             err = cm.exception
     43             self.assertEqual(err.msg, errmsg)
     44             if pos is not None:
     45                 self.assertEqual(err.pos, pos)
     46 
     47     def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
     48         with self.assertRaises(re.error) as cm:
     49             re.sub(pattern, repl, string)
     50         with self.subTest(pattern=pattern, repl=repl):
     51             err = cm.exception
     52             self.assertEqual(err.msg, errmsg)
     53             if pos is not None:
     54                 self.assertEqual(err.pos, pos)
     55 
     56     def test_keep_buffer(self):
     57         # See bug 14212
     58         b = bytearray(b'x')
     59         it = re.finditer(b'a', b)
     60         with self.assertRaises(BufferError):
     61             b.extend(b'x'*400)
     62         list(it)
     63         del it
     64         gc_collect()
     65         b.extend(b'x'*400)
     66 
     67     def test_weakref(self):
     68         s = 'QabbbcR'
     69         x = re.compile('ab+c')
     70         y = proxy(x)
     71         self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
     72 
     73     def test_search_star_plus(self):
     74         self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
     75         self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
     76         self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
     77         self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
     78         self.assertIsNone(re.search('x', 'aaa'))
     79         self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
     80         self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
     81         self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
     82         self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
     83         self.assertIsNone(re.match('a+', 'xxx'))
     84 
     85     def bump_num(self, matchobj):
     86         int_value = int(matchobj.group(0))
     87         return str(int_value + 1)
     88 
     89     def test_basic_re_sub(self):
     90         self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
     91         self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
     92         self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
     93         self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
     94         self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
     95         self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
     96         for y in ("\xe0", "\u0430", "\U0001d49c"):
     97             self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')
     98 
     99         self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    100         self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
    101                          '9.3 -3 24x100y')
    102         self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
    103                          '9.3 -3 23x99y')
    104         self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
    105                          '9.3 -3 23x99y')
    106 
    107         self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    108         self.assertEqual(re.sub('.', r"\n", 'x'), '\n')
    109 
    110         s = r"\1\1"
    111         self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    112         self.assertEqual(re.sub('(.)', s.replace('\\', r'\\'), 'x'), s)
    113         self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)
    114 
    115         self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    116         self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    117         self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    118         self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')
    119 
    120         self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    121         self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    122         self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
    123                          (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
    124         for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
    125             with self.subTest(c):
    126                 with self.assertRaises(re.error):
    127                     self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)
    128 
    129         self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
    130 
    131     def test_bug_449964(self):
    132         # fails for group followed by other escape
    133         self.assertEqual(re.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'),
    134                          'xx\bxx\b')
    135 
    136     def test_bug_449000(self):
    137         # Test for sub() on escaped characters
    138         self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
    139                          'abc\ndef\n')
    140         self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
    141                          'abc\ndef\n')
    142         self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
    143                          'abc\ndef\n')
    144         self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
    145                          'abc\ndef\n')
    146 
    147     def test_bug_1661(self):
    148         # Verify that flags do not get silently ignored with compiled patterns
    149         pattern = re.compile('.')
    150         self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
    151         self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
    152         self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
    153         self.assertRaises(ValueError, re.compile, pattern, re.I)
    154 
    155     def test_bug_3629(self):
    156         # A regex that triggered a bug in the sre-code validator
    157         re.compile("(?P<quote>)(?(quote))")
    158 
    159     def test_sub_template_numeric_escape(self):
    160         # bug 776311 and friends
    161         self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
    162         self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
    163         self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
    164         self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
    165         self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
    166         self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
    167         self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
    168         self.assertEqual(re.sub('x', r'\377', 'x'), '\377')
    169 
    170         self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
    171         self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
    172 
    173         self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
    174         self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
    175         self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
    176         self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
    177         self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
    178 
    179         self.checkTemplateError('x', r'\400', 'x',
    180                                 r'octal escape value \400 outside of '
    181                                 r'range 0-0o377', 0)
    182         self.checkTemplateError('x', r'\777', 'x',
    183                                 r'octal escape value \777 outside of '
    184                                 r'range 0-0o377', 0)
    185 
    186         self.checkTemplateError('x', r'\1', 'x', 'invalid group reference 1', 1)
    187         self.checkTemplateError('x', r'\8', 'x', 'invalid group reference 8', 1)
    188         self.checkTemplateError('x', r'\9', 'x', 'invalid group reference 9', 1)
    189         self.checkTemplateError('x', r'\11', 'x', 'invalid group reference 11', 1)
    190         self.checkTemplateError('x', r'\18', 'x', 'invalid group reference 18', 1)
    191         self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference 1', 1)
    192         self.checkTemplateError('x', r'\90', 'x', 'invalid group reference 90', 1)
    193         self.checkTemplateError('x', r'\99', 'x', 'invalid group reference 99', 1)
    194         self.checkTemplateError('x', r'\118', 'x', 'invalid group reference 11', 1)
    195         self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference 11', 1)
    196         self.checkTemplateError('x', r'\181', 'x', 'invalid group reference 18', 1)
    197         self.checkTemplateError('x', r'\800', 'x', 'invalid group reference 80', 1)
    198         self.checkTemplateError('x', r'\8', '', 'invalid group reference 8', 1)
    199 
    200         # in python2.3 (etc), these loop endlessly in sre_parser.py
    201         self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
    202         self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
    203                          'xz8')
    204         self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
    205                          'xza')
    206 
    207     def test_qualified_re_sub(self):
    208         self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
    209         self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
    210         self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
    211 
    212     def test_bug_114660(self):
    213         self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello  there'),
    214                          'hello there')
    215 
    216     def test_symbolic_groups(self):
    217         re.compile(r'(?P<a>x)(?P=a)(?(a)y)')
    218         re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)')
    219         re.compile(r'(?P<a1>x)\1(?(1)y)')
    220         self.checkPatternError(r'(?P<a>)(?P<a>)',
    221                                "redefinition of group name 'a' as group 2; "
    222                                "was group 1")
    223         self.checkPatternError(r'(?P<a>(?P=a))',
    224                                "cannot refer to an open group", 10)
    225         self.checkPatternError(r'(?Pxy)', 'unknown extension ?Px')
    226         self.checkPatternError(r'(?P<a>)(?P=a', 'missing ), unterminated name', 11)
    227         self.checkPatternError(r'(?P=', 'missing group name', 4)
    228         self.checkPatternError(r'(?P=)', 'missing group name', 4)
    229         self.checkPatternError(r'(?P=1)', "bad character in group name '1'", 4)
    230         self.checkPatternError(r'(?P=a)', "unknown group name 'a'")
    231         self.checkPatternError(r'(?P=a1)', "unknown group name 'a1'")
    232         self.checkPatternError(r'(?P=a.)', "bad character in group name 'a.'", 4)
    233         self.checkPatternError(r'(?P<)', 'missing >, unterminated name', 4)
    234         self.checkPatternError(r'(?P<a', 'missing >, unterminated name', 4)
    235         self.checkPatternError(r'(?P<', 'missing group name', 4)
    236         self.checkPatternError(r'(?P<>)', 'missing group name', 4)
    237         self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
    238         self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
    239         self.checkPatternError(r'(?(', 'missing group name', 3)
    240         self.checkPatternError(r'(?())', 'missing group name', 3)
    241         self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
    242         self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
    243         self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
    244         self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
    245         # New valid/invalid identifiers in Python 3
    246         re.compile('(?P<>x)(?P=)(?()y)')
    247         re.compile('(?P<>x)(?P=)(?()y)')
    248         self.checkPatternError('(?P<>x)', "bad character in group name ''", 4)
    249         # Support > 100 groups.
    250         pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    251         pat = '(?:%s)(?(200)z|t)' % pat
    252         self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
    253 
    254     def test_symbolic_refs(self):
    255         self.checkTemplateError('(?P<a>x)', r'\g<a', 'xx',
    256                                 'missing >, unterminated name', 3)
    257         self.checkTemplateError('(?P<a>x)', r'\g<', 'xx',
    258                                 'missing group name', 3)
    259         self.checkTemplateError('(?P<a>x)', r'\g', 'xx', 'missing <', 2)
    260         self.checkTemplateError('(?P<a>x)', r'\g<a a>', 'xx',
    261                                 "bad character in group name 'a a'", 3)
    262         self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx',
    263                                 'missing group name', 3)
    264         self.checkTemplateError('(?P<a>x)', r'\g<1a1>', 'xx',
    265                                 "bad character in group name '1a1'", 3)
    266         self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
    267                                 'invalid group reference 2', 3)
    268         self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
    269                                 'invalid group reference 2', 1)
    270         with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
    271             re.sub('(?P<a>x)', r'\g<ab>', 'xx')
    272         self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
    273         self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
    274         self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
    275                                 "bad character in group name '-1'", 3)
    276         # New valid/invalid identifiers in Python 3
    277         self.assertEqual(re.sub('(?P<>x)', r'\g<>', 'xx'), 'xx')
    278         self.assertEqual(re.sub('(?P<>x)', r'\g<>', 'xx'), 'xx')
    279         self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx',
    280                                 "bad character in group name ''", 3)
    281         # Support > 100 groups.
    282         pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    283         self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8')
    284 
    285     def test_re_subn(self):
    286         self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
    287         self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
    288         self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
    289         self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
    290         self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
    291         self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
    292 
    def test_re_split(self):
        # str input, plain and subclassed: the pieces must be plain str.
        str_cases = [
            (":", ['', 'a', 'b', '', 'c']),
            (":+", ['', 'a', 'b', 'c']),
            ("(:+)", ['', ':', 'a', ':', 'b', '::', 'c']),
        ]
        for text in (":a:b::c", S(":a:b::c")):
            for sep, expected in str_cases:
                self.assertTypedEqual(re.split(sep, text), expected)

        # bytes-like input: the pieces must be plain bytes.
        bytes_cases = [
            (b":", [b'', b'a', b'b', b'', b'c']),
            (b":+", [b'', b'a', b'b', b'c']),
            (b"(:+)", [b'', b':', b'a', b':', b'b', b'::', b'c']),
        ]
        for data in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                     memoryview(b":a:b::c")):
            for sep, expected in bytes_cases:
                self.assertTypedEqual(re.split(sep, data), expected)

        # Non-ASCII characters between the separators.
        for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                        "\U0001d49c\U0001d49e\U0001d4b5"):
            text = ":%s:%s::%s" % (a, b, c)
            self.assertEqual(re.split(":", text), ['', a, b, '', c])
            self.assertEqual(re.split(":+", text), ['', a, b, c])
            self.assertEqual(re.split("(:+)", text),
                             ['', ':', a, ':', b, '::', c])

        # Capturing versus non-capturing separator groups.
        group_cases = [
            ("(?::+)", ['', 'a', 'b', 'c']),
            ("(:)+", ['', ':', 'a', ':', 'b', ':', 'c']),
            ("([b:]+)", ['', ':', 'a', ':b::', 'c']),
            ("(b)|(:+)", ['', None, ':', 'a', None, ':', '', 'b', None, '',
                          None, '::', 'c']),
            ("(?:b)|(?::+)", ['', 'a', '', '', 'c']),
        ]
        for sep, expected in group_cases:
            self.assertEqual(re.split(sep, ":a:b::c"), expected)

        # Separators that can match an empty string.
        empty_match_cases = [
            (':*', ['', '', 'a', '', 'b', '', 'c', '']),
            ('(?::*)', ['', '', 'a', '', 'b', '', 'c', '']),
            ('(:*)', ['', ':', '', '', 'a', ':', '', '', 'b', '::', '', '', 'c', '', '']),
            ('(:)*', ['', ':', '', None, 'a', ':', '', None, 'b', ':', '', None, 'c', None, '']),
        ] + [
            ('', ['', ':', 'a', ':', 'b', ':', ':', 'c', '']),
            (r'\b', [':', 'a', ':', 'b', '::', 'c', '']),
            (r'(?=:)', ['', ':a', ':b', ':', ':c']),
            (r'(?<=:)', [':', 'a:', 'b:', ':', 'c']),
        ]
        for sep, expected in empty_match_cases:
            with self.subTest(sep=sep):
                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
    345 
    346     def test_qualified_re_split(self):
    347         self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
    348         self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
    349         self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
    350         self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
    351                          ['', ':', 'a', ':', 'b::c'])
    352         self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
    353                          ['', ':', 'a', ':', 'b::c'])
    354         self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
    355                          ['', ':', '', '', 'a:b::c'])
    356 
    def test_re_findall(self):
        self.assertEqual(re.findall(":+", "abc"), [])

        # str input, plain and subclassed: results must be plain str.
        str_cases = [
            (":+", [":", "::", ":::"]),
            ("(:+)", [":", "::", ":::"]),
            ("(:)(:*)", [(":", ""), (":", ":"), (":", "::")]),
        ]
        for text in ("a:b::c:::d", S("a:b::c:::d")):
            for pattern, expected in str_cases:
                self.assertTypedEqual(re.findall(pattern, text), expected)

        # bytes-like input: results must be plain bytes.
        bytes_cases = [
            (b":+", [b":", b"::", b":::"]),
            (b"(:+)", [b":", b"::", b":::"]),
            (b"(:)(:*)", [(b":", b""), (b":", b":"), (b":", b"::")]),
        ]
        for data in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
                     memoryview(b"a:b::c:::d")):
            for pattern, expected in bytes_cases:
                self.assertTypedEqual(re.findall(pattern, data), expected)

        # The same three patterns built around a non-ASCII character.
        for x in ("\xe0", "\u0430", "\U0001d49c"):
            xx, xxx = x * 2, x * 3
            text = "a%sb%sc%sd" % (x, xx, xxx)
            self.assertEqual(re.findall("%s+" % x, text), [x, xx, xxx])
            self.assertEqual(re.findall("(%s+)" % x, text), [x, xx, xxx])
            self.assertEqual(re.findall("(%s)(%s*)" % (x, x), text),
                             [(x, ""), (x, x), (x, xx)])
    382 
    383     def test_bug_117612(self):
    384         self.assertEqual(re.findall(r"(a|(b))", "aba"),
    385                          [("a", ""),("b", "b"),("a", "")])
    386 
    def test_re_match(self):
        # str input, plain and subclassed.
        for text in ('a', S('a')):
            self.assertEqual(re.match('a', text).groups(), ())
            m = re.match('(a)', text)
            self.assertEqual(m.groups(), ('a',))
            self.assertEqual(m.group(0), 'a')
            self.assertEqual(m.group(1), 'a')
            self.assertEqual(m.group(1, 1), ('a', 'a'))
        # bytes-like input.
        for data in (b'a', B(b'a'), bytearray(b'a'), memoryview(b'a')):
            self.assertEqual(re.match(b'a', data).groups(), ())
            m = re.match(b'(a)', data)
            self.assertEqual(m.groups(), (b'a',))
            self.assertEqual(m.group(0), b'a')
            self.assertEqual(m.group(1), b'a')
            self.assertEqual(m.group(1, 1), (b'a', b'a'))
        # A single non-ASCII character as both pattern and subject.
        for a in ("\xe0", "\u0430", "\U0001d49c"):
            self.assertEqual(re.match(a, a).groups(), ())
            m = re.match('(%s)' % a, a)
            self.assertEqual(m.groups(), (a,))
            self.assertEqual(m.group(0), a)
            self.assertEqual(m.group(1), a)
            self.assertEqual(m.group(1, 1), (a, a))

        # Groups that did not participate are reported as None, or as
        # the default passed to groups().
        pat = re.compile('((a)|(b))(c)?')
        for text, expected in [
            ('a', ('a', 'a', None, None)),
            ('b', ('b', None, 'b', None)),
            ('ac', ('a', 'a', None, 'c')),
            ('bc', ('b', None, 'b', 'c')),
        ]:
            self.assertEqual(pat.match(text).groups(), expected)
        self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

        # Groups addressed by number, by name, and by a mix of both.
        pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
        self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
        by_name = pat.match('b').group('a1', 'b2', 'c3')
        self.assertEqual(by_name, (None, 'b', None))
        self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
    419 
    420     def test_group(self):
    421         class Index:
    422             def __init__(self, value):
    423                 self.value = value
    424             def __index__(self):
    425                 return self.value
    426         # A single group
    427         m = re.match('(a)(b)', 'ab')
    428         self.assertEqual(m.group(), 'ab')
    429         self.assertEqual(m.group(0), 'ab')
    430         self.assertEqual(m.group(1), 'a')
    431         self.assertEqual(m.group(Index(1)), 'a')
    432         self.assertRaises(IndexError, m.group, -1)
    433         self.assertRaises(IndexError, m.group, 3)
    434         self.assertRaises(IndexError, m.group, 1<<1000)
    435         self.assertRaises(IndexError, m.group, Index(1<<1000))
    436         self.assertRaises(IndexError, m.group, 'x')
    437         # Multiple groups
    438         self.assertEqual(m.group(2, 1), ('b', 'a'))
    439         self.assertEqual(m.group(Index(2), Index(1)), ('b', 'a'))
    440 
    441     def test_match_getitem(self):
    442         pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    443 
    444         m = pat.match('a')
    445         self.assertEqual(m['a1'], 'a')
    446         self.assertEqual(m['b2'], None)
    447         self.assertEqual(m['c3'], None)
    448         self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=None')
    449         self.assertEqual(m[0], 'a')
    450         self.assertEqual(m[1], 'a')
    451         self.assertEqual(m[2], None)
    452         self.assertEqual(m[3], None)
    453         with self.assertRaisesRegex(IndexError, 'no such group'):
    454             m['X']
    455         with self.assertRaisesRegex(IndexError, 'no such group'):
    456             m[-1]
    457         with self.assertRaisesRegex(IndexError, 'no such group'):
    458             m[4]
    459         with self.assertRaisesRegex(IndexError, 'no such group'):
    460             m[0, 1]
    461         with self.assertRaisesRegex(IndexError, 'no such group'):
    462             m[(0,)]
    463         with self.assertRaisesRegex(IndexError, 'no such group'):
    464             m[(0, 1)]
    465         with self.assertRaisesRegex(IndexError, 'no such group'):
    466             'a1={a2}'.format_map(m)
    467 
    468         m = pat.match('ac')
    469         self.assertEqual(m['a1'], 'a')
    470         self.assertEqual(m['b2'], None)
    471         self.assertEqual(m['c3'], 'c')
    472         self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=c')
    473         self.assertEqual(m[0], 'ac')
    474         self.assertEqual(m[1], 'a')
    475         self.assertEqual(m[2], None)
    476         self.assertEqual(m[3], 'c')
    477 
    478         # Cannot assign.
    479         with self.assertRaises(TypeError):
    480             m[0] = 1
    481 
    482         # No len().
    483         self.assertRaises(TypeError, len, m)
    484 
    def test_re_fullmatch(self):
        # Issue 16203: Proposal: add re.fullmatch() method.
        self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
        for text in ("ab", S("ab")):
            self.assertEqual(re.fullmatch(r"a|ab", text).span(), (0, 2))
        for data in (b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab")):
            self.assertEqual(re.fullmatch(br"a|ab", data).span(), (0, 2))
        for a, b in ("\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e"):
            alternatives = r"%s|%s" % (a, a + b)
            self.assertEqual(re.fullmatch(alternatives, a + b).span(), (0, 2))

        # Non-greedy patterns must still stretch over the whole string.
        for pattern, text, expected in [
            (r".*?$", "abc", (0, 3)),
            (r".*?", "abc", (0, 3)),
            (r"a.*?b", "ab", (0, 2)),
            (r"a.*?b", "abb", (0, 3)),
            (r"a.*?b", "axxb", (0, 4)),
        ]:
            self.assertEqual(re.fullmatch(pattern, text).span(), expected)

        # Anything left over at the end, even a newline, means no match.
        for pattern, text in [
            (r"a+", "ab"),
            (r"abc$", "abc\n"),
            (r"abc\Z", "abc\n"),
            (r"(?m)abc$", "abc\n"),
        ]:
            self.assertIsNone(re.fullmatch(pattern, text))

        # Lookahead and lookbehind assertions.
        for pattern, text, expected in [
            (r"ab(?=c)cd", "abcd", (0, 4)),
            (r"ab(?<=b)cd", "abcd", (0, 4)),
            (r"(?=a|ab)ab", "ab", (0, 2)),
        ]:
            self.assertEqual(re.fullmatch(pattern, text).span(), expected)

        # pos/endpos select the region that has to be matched in full.
        for pattern in (r"bc", r".*?$", r".*?"):
            compiled = re.compile(pattern)
            self.assertEqual(
                compiled.fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
    514 
    515     def test_re_groupref_exists(self):
    516         self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
    517                          ('(', 'a'))
    518         self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a').groups(),
    519                          (None, 'a'))
    520         self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a)'))
    521         self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a'))
    522         self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
    523                          ('a', 'b'))
    524         self.assertEqual(re.match(r'^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
    525                          (None, 'd'))
    526         self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
    527                          (None, 'd'))
    528         self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'a').groups(),
    529                          ('a', ''))
    530 
    531         # Tests for bug #1177831: exercise groups other than the first group
    532         p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
    533         self.assertEqual(p.match('abc').groups(),
    534                          ('a', 'b', 'c'))
    535         self.assertEqual(p.match('ad').groups(),
    536                          ('a', None, 'd'))
    537         self.assertIsNone(p.match('abd'))
    538         self.assertIsNone(p.match('ac'))
    539 
    540         # Support > 100 groups.
    541         pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    542         pat = '(?:%s)(?(200)z)' % pat
    543         self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
    544 
    545         self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
    546         self.checkPatternError(r'()(?(1)a|b',
    547                                'missing ), unterminated subpattern', 2)
    548         self.checkPatternError(r'()(?(1)a|b|c)',
    549                                'conditional backref with more than '
    550                                'two branches', 10)
    551 
    def test_re_groupref_overflow(self):
        # A group number equal to MAXGROUPS must be reported as an invalid
        # group reference, both in a replacement template and in a
        # conditional expression inside a pattern.
        from sre_constants import MAXGROUPS
        expected_msg = 'invalid group reference %d' % MAXGROUPS
        template = r'\g<%s>' % MAXGROUPS
        self.checkTemplateError('()', template, 'xx', expected_msg, 3)
        conditional = r'(?P<a>)(?(%d))' % MAXGROUPS
        self.checkPatternError(conditional, expected_msg, 10)
    558 
    def test_re_groupref(self):
        # Numeric backreferences: \1 has to repeat what group 1 captured,
        # and referring to a group that is still open is a compile error.
        def captured(pattern, text):
            return re.match(pattern, text).groups()

        def rejects(pattern, text):
            self.assertIsNone(re.match(pattern, text))

        self.assertEqual(captured(r'^(\|)?([^()]+)\1$', '|a|'), ('|', 'a'))
        self.assertEqual(captured(r'^(\|)?([^()]+)\1?$', 'a'), (None, 'a'))
        rejects(r'^(\|)?([^()]+)\1$', 'a|')
        rejects(r'^(\|)?([^()]+)\1$', '|a')
        self.assertEqual(captured(r'^(?:(a)|c)(\1)$', 'aa'), ('a', 'a'))
        self.assertEqual(captured(r'^(?:(a)|c)(\1)?$', 'c'), (None, None))

        self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
    572 
    573     def test_groupdict(self):
    574         self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
    575                                   'first second').groupdict(),
    576                          {'first':'first', 'second':'second'})
    577 
    578     def test_expand(self):
    579         self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
    580                                   "first second")
    581                                   .expand(r"\2 \1 \g<second> \g<first>"),
    582                          "second first second first")
    583         self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
    584                                   "first")
    585                                   .expand(r"\2 \g<second>"),
    586                          " ")
    587 
    def test_repeat_minmax(self):
        # Counted repetition {n} / {m,n}, in greedy and lazy ("?" suffix)
        # form, applied to a capturing group and to a plain literal.

        # The upper bound is too small to cover all of "abc".
        self.assertIsNone(re.match(r"^(\w){1}$", "abc"))
        self.assertIsNone(re.match(r"^(\w){1}?$", "abc"))
        self.assertIsNone(re.match(r"^(\w){1,2}$", "abc"))
        self.assertIsNone(re.match(r"^(\w){1,2}?$", "abc"))

        # A repeated group reports its last iteration, hence "c".
        # Greedy forms.
        self.assertEqual(re.match(r"^(\w){3}$", "abc").group(1), "c")
        self.assertEqual(re.match(r"^(\w){1,3}$", "abc").group(1), "c")
        self.assertEqual(re.match(r"^(\w){1,4}$", "abc").group(1), "c")
        # This used to repeat the lazy {3,4}? case below, which left the
        # greedy {3,4} form untested.
        self.assertEqual(re.match(r"^(\w){3,4}$", "abc").group(1), "c")
        # Lazy forms.
        self.assertEqual(re.match(r"^(\w){3}?$", "abc").group(1), "c")
        self.assertEqual(re.match(r"^(\w){1,3}?$", "abc").group(1), "c")
        self.assertEqual(re.match(r"^(\w){1,4}?$", "abc").group(1), "c")
        self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c")

        # Same checks on a repeated literal instead of a group.
        self.assertIsNone(re.match(r"^x{1}$", "xxx"))
        self.assertIsNone(re.match(r"^x{1}?$", "xxx"))
        self.assertIsNone(re.match(r"^x{1,2}$", "xxx"))
        self.assertIsNone(re.match(r"^x{1,2}?$", "xxx"))

        # Greedy forms.
        self.assertTrue(re.match(r"^x{3}$", "xxx"))
        self.assertTrue(re.match(r"^x{1,3}$", "xxx"))
        self.assertTrue(re.match(r"^x{3,3}$", "xxx"))
        self.assertTrue(re.match(r"^x{1,4}$", "xxx"))
        # Greedy {3,4}; this line also used to duplicate the lazy case.
        self.assertTrue(re.match(r"^x{3,4}$", "xxx"))
        # Lazy forms.
        self.assertTrue(re.match(r"^x{3}?$", "xxx"))
        self.assertTrue(re.match(r"^x{1,3}?$", "xxx"))
        self.assertTrue(re.match(r"^x{1,4}?$", "xxx"))
        self.assertTrue(re.match(r"^x{3,4}?$", "xxx"))

        # "{}" without numbers is not a quantifier: it matches literally.
        self.assertIsNone(re.match(r"^x{}$", "xxx"))
        self.assertTrue(re.match(r"^x{}$", "x{}"))

        self.checkPatternError(r'x{2,1}',
                               'min repeat greater than max repeat', 2)
    623 
    624     def test_getattr(self):
    625         self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
    626         self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
    627         self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
    628         self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
    629         self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
    630                          {'first': 1, 'other': 2})
    631 
    632         self.assertEqual(re.match("(a)", "a").pos, 0)
    633         self.assertEqual(re.match("(a)", "a").endpos, 1)
    634         self.assertEqual(re.match("(a)", "a").string, "a")
    635         self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
    636         self.assertTrue(re.match("(a)", "a").re)
    637 
    638         # Issue 14260. groupindex should be non-modifiable mapping.
    639         p = re.compile(r'(?i)(?P<first>a)(?P<other>b)')
    640         self.assertEqual(sorted(p.groupindex), ['first', 'other'])
    641         self.assertEqual(p.groupindex['other'], 2)
    642         with self.assertRaises(TypeError):
    643             p.groupindex['other'] = 0
    644         self.assertEqual(p.groupindex['other'], 2)
    645 
    646     def test_special_escapes(self):
    647         self.assertEqual(re.search(r"\b(b.)\b",
    648                                    "abcd abc bcd bx").group(1), "bx")
    649         self.assertEqual(re.search(r"\B(b.)\B",
    650                                    "abc bcd bc abxd").group(1), "bx")
    651         self.assertEqual(re.search(r"\b(b.)\b",
    652                                    "abcd abc bcd bx", re.ASCII).group(1), "bx")
    653         self.assertEqual(re.search(r"\B(b.)\B",
    654                                    "abc bcd bc abxd", re.ASCII).group(1), "bx")
    655         self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
    656         self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
    657         self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
    658         self.assertEqual(re.search(br"\b(b.)\b",
    659                                    b"abcd abc bcd bx").group(1), b"bx")
    660         self.assertEqual(re.search(br"\B(b.)\B",
    661                                    b"abc bcd bc abxd").group(1), b"bx")
    662         self.assertEqual(re.search(br"\b(b.)\b",
    663                                    b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
    664         self.assertEqual(re.search(br"\B(b.)\B",
    665                                    b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
    666         self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
    667         self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
    668         self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
    669         self.assertEqual(re.search(r"\d\D\w\W\s\S",
    670                                    "1aa! a").group(0), "1aa! a")
    671         self.assertEqual(re.search(br"\d\D\w\W\s\S",
    672                                    b"1aa! a").group(0), b"1aa! a")
    673         self.assertEqual(re.search(r"\d\D\w\W\s\S",
    674                                    "1aa! a", re.ASCII).group(0), "1aa! a")
    675         self.assertEqual(re.search(br"\d\D\w\W\s\S",
    676                                    b"1aa! a", re.LOCALE).group(0), b"1aa! a")
    677 
    def test_other_escapes(self):
        # Escaped punctuation matches itself, inside and outside a set;
        # a backslash followed by any of the letters listed below must be
        # rejected with re.error.
        self.checkPatternError("\\", 'bad escape (end of pattern)', 0)

        def whole(pattern, text):
            return re.match(pattern, text).group()

        def rejects(pattern, text):
            self.assertIsNone(re.match(pattern, text))

        self.assertEqual(whole(r"\(", '('), '(')
        rejects(r"\(", ')')
        self.assertEqual(whole(r"\\", '\\'), '\\')
        self.assertEqual(whole(r"[\]]", ']'), ']')
        rejects(r"[\]]", '[')
        self.assertEqual(whole(r"[a\-c]", '-'), '-')
        rejects(r"[a\-c]", 'b')
        self.assertEqual(whole(r"[\^a]+", 'a^'), 'a^')
        rejects(r"[\^a]+", 'b')

        re.purge()  # for warnings
        for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
            with self.subTest(c):
                with self.assertRaises(re.error):
                    re.compile('\\%c' % c)
        for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
            with self.subTest(c):
                with self.assertRaises(re.error):
                    re.compile('[\\%c]' % c)
    696 
    697     def test_string_boundaries(self):
    698         # See http://bugs.python.org/issue10713
    699         self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
    700                          "abc")
    701         # There's a word boundary at the start of a string.
    702         self.assertTrue(re.match(r"\b", "abc"))
    703         # A non-empty string includes a non-boundary zero-length match.
    704         self.assertTrue(re.search(r"\B", "abc"))
    705         # There is no non-boundary match at the start of a string.
    706         self.assertFalse(re.match(r"\B", "abc"))
    707         # However, an empty string contains no word boundaries, and also no
    708         # non-boundaries.
    709         self.assertIsNone(re.search(r"\B", ""))
    710         # This one is questionable and different from the perlre behaviour,
    711         # but describes current behavior.
    712         self.assertIsNone(re.search(r"\b", ""))
    713         # A single word-character string has two boundaries, but no
    714         # non-boundary gaps.
    715         self.assertEqual(len(re.findall(r"\b", "a")), 2)
    716         self.assertEqual(len(re.findall(r"\B", "a")), 0)
    717         # If there are no words, there are no boundaries
    718         self.assertEqual(len(re.findall(r"\b", " ")), 0)
    719         self.assertEqual(len(re.findall(r"\b", "   ")), 0)
    720         # Can match around the whitespace.
    721         self.assertEqual(len(re.findall(r"\B", " ")), 2)
    722 
    723     def test_bigcharset(self):
    724         self.assertEqual(re.match("([\u2222\u2223])",
    725                                   "\u2222").group(1), "\u2222")
    726         r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
    727         self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
    728 
    729     def test_big_codesize(self):
    730         # Issue #1160
    731         r = re.compile('|'.join(('%d'%x for x in range(10000))))
    732         self.assertTrue(r.match('1000'))
    733         self.assertTrue(r.match('9999'))
    734 
    735     def test_anyall(self):
    736         self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
    737                          "a\nb")
    738         self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
    739                          "a\n\nb")
    740 
    741     def test_lookahead(self):
    742         self.assertEqual(re.match(r"(a(?=\s[^a]))", "a b").group(1), "a")
    743         self.assertEqual(re.match(r"(a(?=\s[^a]*))", "a b").group(1), "a")
    744         self.assertEqual(re.match(r"(a(?=\s[abc]))", "a b").group(1), "a")
    745         self.assertEqual(re.match(r"(a(?=\s[abc]*))", "a bc").group(1), "a")
    746         self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
    747         self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
    748         self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
    749 
    750         self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
    751         self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
    752         self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
    753         self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
    754 
    755         # Group reference.
    756         self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
    757         self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
    758         # Conditional group reference.
    759         self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
    760         self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
    761         self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
    762         self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
    763         self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
    764         # Group used before defined.
    765         self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
    766         self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
    767         self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
    768 
    769     def test_lookbehind(self):
    770         self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
    771         self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
    772         self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
    773         self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
    774         # Group reference.
    775         self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
    776         self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
    777         self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
    778         self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
    779         # Conditional group reference.
    780         self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
    781         self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
    782         self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
    783         self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
    784         self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
    785         # Group used before defined.
    786         self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
    787         self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
    788         self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
    789         # Group defined in the same lookbehind pattern
    790         self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
    791         self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
    792         self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
    793         self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
    794 
    795     def test_ignore_case(self):
    796         self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
    797         self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
    798         self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
    799         self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
    800         self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
    801         self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
    802         self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
    803         self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
    804         self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
    805         self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
    806 
    807         assert '\u212a'.lower() == 'k' # ''
    808         self.assertTrue(re.match(r'K', '\u212a', re.I))
    809         self.assertTrue(re.match(r'k', '\u212a', re.I))
    810         self.assertTrue(re.match(r'\u212a', 'K', re.I))
    811         self.assertTrue(re.match(r'\u212a', 'k', re.I))
    812         assert '\u017f'.upper() == 'S' # ''
    813         self.assertTrue(re.match(r'S', '\u017f', re.I))
    814         self.assertTrue(re.match(r's', '\u017f', re.I))
    815         self.assertTrue(re.match(r'\u017f', 'S', re.I))
    816         self.assertTrue(re.match(r'\u017f', 's', re.I))
    817         assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # '', ''
    818         self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
    819         self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
    820 
    821     def test_ignore_case_set(self):
    822         self.assertTrue(re.match(r'[19A]', 'A', re.I))
    823         self.assertTrue(re.match(r'[19a]', 'a', re.I))
    824         self.assertTrue(re.match(r'[19a]', 'A', re.I))
    825         self.assertTrue(re.match(r'[19A]', 'a', re.I))
    826         self.assertTrue(re.match(br'[19A]', b'A', re.I))
    827         self.assertTrue(re.match(br'[19a]', b'a', re.I))
    828         self.assertTrue(re.match(br'[19a]', b'A', re.I))
    829         self.assertTrue(re.match(br'[19A]', b'a', re.I))
    830         assert '\u212a'.lower() == 'k' # ''
    831         self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
    832         self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
    833         self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
    834         self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
    835         assert '\u017f'.upper() == 'S' # ''
    836         self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
    837         self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
    838         self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
    839         self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
    840         assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # '', ''
    841         self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
    842         self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
    843 
    844     def test_ignore_case_range(self):
    845         # Issues #3511, #17381.
    846         self.assertTrue(re.match(r'[9-a]', '_', re.I))
    847         self.assertIsNone(re.match(r'[9-A]', '_', re.I))
    848         self.assertTrue(re.match(br'[9-a]', b'_', re.I))
    849         self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
    850         self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
    851         self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
    852         self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
    853         self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
    854         self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
    855         self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
    856         self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
    857         self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
    858         self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
    859         self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
    860         self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
    861         self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
    862 
    863         assert '\u212a'.lower() == 'k' # ''
    864         self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
    865         self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
    866         self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
    867         self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
    868         assert '\u017f'.upper() == 'S' # ''
    869         self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
    870         self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
    871         self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
    872         self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
    873         assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # '', ''
    874         self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
    875         self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
    876 
    877     def test_category(self):
    878         self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
    879 
    880     @cpython_only
    881     def test_case_helpers(self):
    882         import _sre
    883         for i in range(128):
    884             c = chr(i)
    885             lo = ord(c.lower())
    886             self.assertEqual(_sre.ascii_tolower(i), lo)
    887             self.assertEqual(_sre.unicode_tolower(i), lo)
    888             iscased = c in string.ascii_letters
    889             self.assertEqual(_sre.ascii_iscased(i), iscased)
    890             self.assertEqual(_sre.unicode_iscased(i), iscased)
    891 
    892         for i in list(range(128, 0x1000)) + [0x10400, 0x10428]:
    893             c = chr(i)
    894             self.assertEqual(_sre.ascii_tolower(i), i)
    895             if i != 0x0130:
    896                 self.assertEqual(_sre.unicode_tolower(i), ord(c.lower()))
    897             iscased = c != c.lower() or c != c.upper()
    898             self.assertFalse(_sre.ascii_iscased(i))
    899             self.assertEqual(_sre.unicode_iscased(i),
    900                              c != c.lower() or c != c.upper())
    901 
    902         self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130)
    903         self.assertEqual(_sre.unicode_tolower(0x0130), ord('i'))
    904         self.assertFalse(_sre.ascii_iscased(0x0130))
    905         self.assertTrue(_sre.unicode_iscased(0x0130))
    906 
    907     def test_not_literal(self):
    908         self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
    909         self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb")
    910 
    911     def test_possible_set_operations(self):
    912         s = bytes(range(128)).decode()
    913         with self.assertWarns(FutureWarning):
    914             p = re.compile(r'[0-9--1]')
    915         self.assertEqual(p.findall(s), list('-./0123456789'))
    916         self.assertEqual(re.findall(r'[--1]', s), list('-./01'))
    917         with self.assertWarns(FutureWarning):
    918             p = re.compile(r'[%--1]')
    919         self.assertEqual(p.findall(s), list("%&'()*+,-1"))
    920         with self.assertWarns(FutureWarning):
    921             p = re.compile(r'[%--]')
    922         self.assertEqual(p.findall(s), list("%&'()*+,-"))
    923 
    924         with self.assertWarns(FutureWarning):
    925             p = re.compile(r'[0-9&&1]')
    926         self.assertEqual(p.findall(s), list('&0123456789'))
    927         with self.assertWarns(FutureWarning):
    928             p = re.compile(r'[\d&&1]')
    929         self.assertEqual(p.findall(s), list('&0123456789'))
    930         self.assertEqual(re.findall(r'[&&1]', s), list('&1'))
    931 
    932         with self.assertWarns(FutureWarning):
    933             p = re.compile(r'[0-9||a]')
    934         self.assertEqual(p.findall(s), list('0123456789a|'))
    935         with self.assertWarns(FutureWarning):
    936             p = re.compile(r'[\d||a]')
    937         self.assertEqual(p.findall(s), list('0123456789a|'))
    938         self.assertEqual(re.findall(r'[||1]', s), list('1|'))
    939 
    940         with self.assertWarns(FutureWarning):
    941             p = re.compile(r'[0-9~~1]')
    942         self.assertEqual(p.findall(s), list('0123456789~'))
    943         with self.assertWarns(FutureWarning):
    944             p = re.compile(r'[\d~~1]')
    945         self.assertEqual(p.findall(s), list('0123456789~'))
    946         self.assertEqual(re.findall(r'[~~1]', s), list('1~'))
    947 
    948         with self.assertWarns(FutureWarning):
    949             p = re.compile(r'[[0-9]|]')
    950         self.assertEqual(p.findall(s), list('0123456789[]'))
    951 
    952         with self.assertWarns(FutureWarning):
    953             p = re.compile(r'[[:digit:]|]')
    954         self.assertEqual(p.findall(s), list(':[]dgit'))
    955 
    956     def test_search_coverage(self):
    957         self.assertEqual(re.search(r"\s(b)", " b").group(1), "b")
    958         self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
    959 
    def assertMatch(self, pattern, text, match=None, span=None,
                    matcher=re.fullmatch):
        # Assert that matcher(pattern, text) succeeds and that the matched
        # text and span equal *match* and *span*.  With neither given, the
        # pattern is expected to match the whole of *text*.  Passing only
        # one of the two raises ValueError.
        if (match is None) != (span is None):
            raise ValueError('If match is not None, span should be specified '
                             '(and vice versa).')
        if match is None:
            # the pattern matches the whole text
            match, span = text, (0, len(text))
        m = matcher(pattern, text)
        self.assertTrue(m)
        self.assertEqual(m.group(), match)
        self.assertEqual(m.span(), span)
    973 
    # Characters that re.escape() is expected to return unchanged; used by
    # test_re_escape and (ASCII-encoded) by test_re_escape_bytes.
    LITERAL_CHARS = string.ascii_letters + string.digits + '!"%\',/:;<=>@_`'
    975 
    def test_re_escape(self):
        # re.escape() of each of the first 256 code points must give a
        # pattern matching exactly that character: on its own, inside a
        # set, and in verbose mode.
        latin1 = ''.join(map(chr, range(256)))
        for ch in latin1:
            for wrapper in ('%s', '[%s]', '(?x)%s'):
                self.assertMatch(wrapper % re.escape(ch), ch)
        self.assertMatch(re.escape(latin1), latin1)
        # These characters must come back with a leading backslash.
        for ch in '-.]{}':
            self.assertEqual(re.escape(ch)[:1], '\\')
        # The literal characters must come back untouched.
        literal_chars = self.LITERAL_CHARS
        self.assertEqual(re.escape(literal_chars), literal_chars)
    987 
    def test_re_escape_bytes(self):
        # Bytes counterpart of test_re_escape: every byte value, escaped,
        # must match itself alone, inside a set, and in verbose mode.
        p = bytes(range(256))
        for code in p:
            single = bytes([code])
            for wrapper in (b'%s', b'[%s]', b'(?x)%s'):
                self.assertMatch(wrapper % re.escape(single), single)
        self.assertMatch(re.escape(p), p)
        # These bytes must come back with a leading backslash.
        for code in b'-.]{}':
            single = bytes([code])
            self.assertEqual(re.escape(single)[:1], b'\\')
        # The literal characters must come back untouched.
        literal_chars = self.LITERAL_CHARS.encode('ascii')
        self.assertEqual(re.escape(literal_chars), literal_chars)
   1001 
   1002     def test_re_escape_non_ascii(self):
   1003         s = 'xxx\u2620\u2620\u2620xxx'
   1004         s_escaped = re.escape(s)
   1005         self.assertEqual(s_escaped, s)
   1006         self.assertMatch(s_escaped, s)
   1007         self.assertMatch('.%s+.' % re.escape('\u2620'), s,
   1008                          'x\u2620\u2620\u2620x', (2, 7), re.search)
   1009 
   1010     def test_re_escape_non_ascii_bytes(self):
   1011         b = 'y\u2620y\u2620y'.encode('utf-8')
   1012         b_escaped = re.escape(b)
   1013         self.assertEqual(b_escaped, b)
   1014         self.assertMatch(b_escaped, b)
   1015         res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
   1016         self.assertEqual(len(res), 2)
   1017 
   1018     def test_pickling(self):
   1019         import pickle
   1020         oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
   1021         for proto in range(pickle.HIGHEST_PROTOCOL + 1):
   1022             pickled = pickle.dumps(oldpat, proto)
   1023             newpat = pickle.loads(pickled)
   1024             self.assertEqual(newpat, oldpat)
   1025         # current pickle expects the _compile() reconstructor in re module
   1026         from re import _compile
   1027 
   1028     def test_copying(self):
   1029         import copy
   1030         p = re.compile(r'(?P<int>\d+)(?:\.(?P<frac>\d*))?')
   1031         self.assertIs(copy.copy(p), p)
   1032         self.assertIs(copy.deepcopy(p), p)
   1033         m = p.match('12.34')
   1034         self.assertIs(copy.copy(m), m)
   1035         self.assertIs(copy.deepcopy(m), m)
   1036 
   1037     def test_constants(self):
   1038         self.assertEqual(re.I, re.IGNORECASE)
   1039         self.assertEqual(re.L, re.LOCALE)
   1040         self.assertEqual(re.M, re.MULTILINE)
   1041         self.assertEqual(re.S, re.DOTALL)
   1042         self.assertEqual(re.X, re.VERBOSE)
   1043 
   1044     def test_flags(self):
   1045         for flag in [re.I, re.M, re.X, re.S, re.A, re.U]:
   1046             self.assertTrue(re.compile('^pattern$', flag))
   1047         for flag in [re.I, re.M, re.X, re.S, re.A, re.L]:
   1048             self.assertTrue(re.compile(b'^pattern$', flag))
   1049 
   1050     def test_sre_character_literals(self):
   1051         for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
   1052             if i < 256:
   1053                 self.assertTrue(re.match(r"\%03o" % i, chr(i)))
   1054                 self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
   1055                 self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
   1056                 self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
   1057                 self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
   1058                 self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
   1059             if i < 0x10000:
   1060                 self.assertTrue(re.match(r"\u%04x" % i, chr(i)))
   1061                 self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0"))
   1062                 self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z"))
   1063             self.assertTrue(re.match(r"\U%08x" % i, chr(i)))
   1064             self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0"))
   1065             self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z"))
   1066         self.assertTrue(re.match(r"\0", "\000"))
   1067         self.assertTrue(re.match(r"\08", "\0008"))
   1068         self.assertTrue(re.match(r"\01", "\001"))
   1069         self.assertTrue(re.match(r"\018", "\0018"))
   1070         self.checkPatternError(r"\567",
   1071                                r'octal escape value \567 outside of '
   1072                                r'range 0-0o377', 0)
   1073         self.checkPatternError(r"\911", 'invalid group reference 91', 1)
   1074         self.checkPatternError(r"\x1", r'incomplete escape \x1', 0)
   1075         self.checkPatternError(r"\x1z", r'incomplete escape \x1', 0)
   1076         self.checkPatternError(r"\u123", r'incomplete escape \u123', 0)
   1077         self.checkPatternError(r"\u123z", r'incomplete escape \u123', 0)
   1078         self.checkPatternError(r"\U0001234", r'incomplete escape \U0001234', 0)
   1079         self.checkPatternError(r"\U0001234z", r'incomplete escape \U0001234', 0)
   1080         self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0)
   1081 
   1082     def test_sre_character_class_literals(self):
   1083         for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
   1084             if i < 256:
   1085                 self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
   1086                 self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
   1087                 self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
   1088                 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
   1089                 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
   1090                 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
   1091                 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
   1092                 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
   1093             if i < 0x10000:
   1094                 self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
   1095                 self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
   1096                 self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
   1097             self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
   1098             self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
   1099             self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
   1100         self.checkPatternError(r"[\567]",
   1101                                r'octal escape value \567 outside of '
   1102                                r'range 0-0o377', 1)
   1103         self.checkPatternError(r"[\911]", r'bad escape \9', 1)
   1104         self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1)
   1105         self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1)
   1106         self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1)
   1107         self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1)
   1108         self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
   1109 
   1110     def test_sre_byte_literals(self):
   1111         for i in [0, 8, 16, 32, 64, 127, 128, 255]:
   1112             self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
   1113             self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
   1114             self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
   1115             self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
   1116             self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
   1117             self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
   1118         self.assertRaises(re.error, re.compile, br"\u1234")
   1119         self.assertRaises(re.error, re.compile, br"\U00012345")
   1120         self.assertTrue(re.match(br"\0", b"\000"))
   1121         self.assertTrue(re.match(br"\08", b"\0008"))
   1122         self.assertTrue(re.match(br"\01", b"\001"))
   1123         self.assertTrue(re.match(br"\018", b"\0018"))
   1124         self.checkPatternError(br"\567",
   1125                                r'octal escape value \567 outside of '
   1126                                r'range 0-0o377', 0)
   1127         self.checkPatternError(br"\911", 'invalid group reference 91', 1)
   1128         self.checkPatternError(br"\x1", r'incomplete escape \x1', 0)
   1129         self.checkPatternError(br"\x1z", r'incomplete escape \x1', 0)
   1130 
   1131     def test_sre_byte_class_literals(self):
   1132         for i in [0, 8, 16, 32, 64, 127, 128, 255]:
   1133             self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
   1134             self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
   1135             self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
   1136             self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
   1137             self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
   1138             self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
   1139             self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
   1140             self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
   1141         self.assertRaises(re.error, re.compile, br"[\u1234]")
   1142         self.assertRaises(re.error, re.compile, br"[\U00012345]")
   1143         self.checkPatternError(br"[\567]",
   1144                                r'octal escape value \567 outside of '
   1145                                r'range 0-0o377', 1)
   1146         self.checkPatternError(br"[\911]", r'bad escape \9', 1)
   1147         self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1)
   1148 
   1149     def test_character_set_errors(self):
   1150         self.checkPatternError(r'[', 'unterminated character set', 0)
   1151         self.checkPatternError(r'[^', 'unterminated character set', 0)
   1152         self.checkPatternError(r'[a', 'unterminated character set', 0)
   1153         # bug 545855 -- This pattern failed to cause a compile error as it
   1154         # should, instead provoking a TypeError.
   1155         self.checkPatternError(r"[a-", 'unterminated character set', 0)
   1156         self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1)
   1157         self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1)
   1158         self.checkPatternError(r"[b-a]", 'bad character range b-a', 1)
   1159 
   1160     def test_bug_113254(self):
   1161         self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
   1162         self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
   1163         self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
   1164 
   1165     def test_bug_527371(self):
   1166         # bug described in patches 527371/672491
   1167         self.assertIsNone(re.match(r'(a)?a','a').lastindex)
   1168         self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
   1169         self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
   1170         self.assertEqual(re.match(r"(?P<a>a(b))", "ab").lastgroup, 'a')
   1171         self.assertEqual(re.match(r"((a))", "a").lastindex, 1)
   1172 
   1173     def test_bug_418626(self):
   1174         # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
   1175         # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
   1176         # pattern '*?' on a long string.
   1177         self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
   1178         self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
   1179                          20003)
   1180         self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
   1181         # non-simple '*?' still used to hit the recursion limit, before the
   1182         # non-recursive scheme was implemented.
   1183         self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
   1184 
   1185     def test_bug_612074(self):
   1186         pat="["+re.escape("\u2039")+"]"
   1187         self.assertEqual(re.compile(pat) and 1, 1)
   1188 
   1189     def test_stack_overflow(self):
   1190         # nasty cases that used to overflow the straightforward recursive
   1191         # implementation of repeated groups.
   1192         self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
   1193         self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
   1194         self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
   1195 
   1196     def test_nothing_to_repeat(self):
   1197         for reps in '*', '+', '?', '{1,2}':
   1198             for mod in '', '?':
   1199                 self.checkPatternError('%s%s' % (reps, mod),
   1200                                        'nothing to repeat', 0)
   1201                 self.checkPatternError('(?:%s%s)' % (reps, mod),
   1202                                        'nothing to repeat', 3)
   1203 
   1204     def test_multiple_repeat(self):
   1205         for outer_reps in '*', '+', '{1,2}':
   1206             for outer_mod in '', '?':
   1207                 outer_op = outer_reps + outer_mod
   1208                 for inner_reps in '*', '+', '?', '{1,2}':
   1209                     for inner_mod in '', '?':
   1210                         inner_op = inner_reps + inner_mod
   1211                         self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
   1212                                 'multiple repeat', 1 + len(inner_op))
   1213 
   1214     def test_unlimited_zero_width_repeat(self):
   1215         # Issue #9669
   1216         self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
   1217         self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
   1218         self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
   1219         self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
   1220         self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
   1221         self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
   1222 
   1223     def test_scanner(self):
   1224         def s_ident(scanner, token): return token
   1225         def s_operator(scanner, token): return "op%s" % token
   1226         def s_float(scanner, token): return float(token)
   1227         def s_int(scanner, token): return int(token)
   1228 
   1229         scanner = Scanner([
   1230             (r"[a-zA-Z_]\w*", s_ident),
   1231             (r"\d+\.\d*", s_float),
   1232             (r"\d+", s_int),
   1233             (r"=|\+|-|\*|/", s_operator),
   1234             (r"\s+", None),
   1235             ])
   1236 
   1237         self.assertTrue(scanner.scanner.scanner("").pattern)
   1238 
   1239         self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
   1240                          (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
   1241                            'op+', 'bar'], ''))
   1242 
   1243     def test_bug_448951(self):
   1244         # bug 448951 (similar to 429357, but with single char match)
   1245         # (Also test greedy matches.)
   1246         for op in '','?','*':
   1247             self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
   1248                              (None, None))
   1249             self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
   1250                              ('a:', 'a'))
   1251 
   1252     def test_bug_725106(self):
   1253         # capturing groups in alternatives in repeats
   1254         self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
   1255                          ('b', 'a'))
   1256         self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
   1257                          ('c', 'b'))
   1258         self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
   1259                          ('b', None))
   1260         self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
   1261                          ('b', None))
   1262         self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
   1263                          ('b', 'a'))
   1264         self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
   1265                          ('c', 'b'))
   1266         self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
   1267                          ('b', None))
   1268         self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
   1269                          ('b', None))
   1270 
   1271     def test_bug_725149(self):
   1272         # mark_stack_base restoring before restoring marks
   1273         self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
   1274                          ('a', None))
   1275         self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
   1276                          ('a', None, None))
   1277 
   1278     def test_bug_764548(self):
   1279         # bug 764548, re.compile() barfs on str/unicode subclasses
   1280         class my_unicode(str): pass
   1281         pat = re.compile(my_unicode("abc"))
   1282         self.assertIsNone(pat.match("xyz"))
   1283 
   1284     def test_finditer(self):
   1285         iter = re.finditer(r":+", "a:b::c:::d")
   1286         self.assertEqual([item.group(0) for item in iter],
   1287                          [":", "::", ":::"])
   1288 
   1289         pat = re.compile(r":+")
   1290         iter = pat.finditer("a:b::c:::d", 1, 10)
   1291         self.assertEqual([item.group(0) for item in iter],
   1292                          [":", "::", ":::"])
   1293 
   1294         pat = re.compile(r":+")
   1295         iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
   1296         self.assertEqual([item.group(0) for item in iter],
   1297                          [":", "::", ":::"])
   1298 
   1299         pat = re.compile(r":+")
   1300         iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
   1301         self.assertEqual([item.group(0) for item in iter],
   1302                          [":", "::", ":::"])
   1303 
   1304         pat = re.compile(r":+")
   1305         iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
   1306         self.assertEqual([item.group(0) for item in iter],
   1307                          ["::", "::"])
   1308 
   1309     def test_bug_926075(self):
   1310         self.assertIsNot(re.compile('bug_926075'),
   1311                          re.compile(b'bug_926075'))
   1312 
   1313     def test_bug_931848(self):
   1314         pattern = "[\u002E\u3002\uFF0E\uFF61]"
   1315         self.assertEqual(re.compile(pattern).split("a.b.c"),
   1316                          ['a','b','c'])
   1317 
   1318     def test_bug_581080(self):
   1319         iter = re.finditer(r"\s", "a b")
   1320         self.assertEqual(next(iter).span(), (1,2))
   1321         self.assertRaises(StopIteration, next, iter)
   1322 
   1323         scanner = re.compile(r"\s").scanner("a b")
   1324         self.assertEqual(scanner.search().span(), (1, 2))
   1325         self.assertIsNone(scanner.search())
   1326 
   1327     def test_bug_817234(self):
   1328         iter = re.finditer(r".*", "asdf")
   1329         self.assertEqual(next(iter).span(), (0, 4))
   1330         self.assertEqual(next(iter).span(), (4, 4))
   1331         self.assertRaises(StopIteration, next, iter)
   1332 
   1333     def test_bug_6561(self):
   1334         # '\d' should match characters in Unicode category 'Nd'
   1335         # (Number, Decimal Digit), but not those in 'Nl' (Number,
   1336         # Letter) or 'No' (Number, Other).
   1337         decimal_digits = [
   1338             '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
   1339             '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
   1340             '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
   1341             ]
   1342         for x in decimal_digits:
   1343             self.assertEqual(re.match(r'^\d$', x).group(0), x)
   1344 
   1345         not_decimal_digits = [
   1346             '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
   1347             '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
   1348             '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
   1349             '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
   1350             ]
   1351         for x in not_decimal_digits:
   1352             self.assertIsNone(re.match(r'^\d$', x))
   1353 
   1354     def test_empty_array(self):
   1355         # SF buf 1647541
   1356         import array
   1357         for typecode in 'bBuhHiIlLfd':
   1358             a = array.array(typecode)
   1359             self.assertIsNone(re.compile(b"bla").match(a))
   1360             self.assertEqual(re.compile(b"").match(a).groups(), ())
   1361 
    def test_inline_flags(self):
        # Bug #1700
        # Checks inline flag groups such as (?i), (?s) and (?x) against the
        # equivalent re.* flag arguments, then checks that inline flags
        # placed anywhere but the start of the pattern still match while
        # emitting a DeprecationWarning.
        upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
        lower_char = '\u1ea1' # Latin Small Letter A with Dot Below

        # Baseline: both flags passed as arguments.
        p = re.compile('.' + upper_char, re.I | re.S)
        q = p.match('\n' + lower_char)
        self.assertTrue(q)

        p = re.compile('.' + lower_char, re.I | re.S)
        q = p.match('\n' + upper_char)
        self.assertTrue(q)

        # (?i) inline, re.S as an argument.
        p = re.compile('(?i).' + upper_char, re.S)
        q = p.match('\n' + lower_char)
        self.assertTrue(q)

        p = re.compile('(?i).' + lower_char, re.S)
        q = p.match('\n' + upper_char)
        self.assertTrue(q)

        # Both flags in a single inline group.
        p = re.compile('(?is).' + upper_char)
        q = p.match('\n' + lower_char)
        self.assertTrue(q)

        p = re.compile('(?is).' + lower_char)
        q = p.match('\n' + upper_char)
        self.assertTrue(q)

        # Two consecutive inline groups.
        p = re.compile('(?s)(?i).' + upper_char)
        q = p.match('\n' + lower_char)
        self.assertTrue(q)

        p = re.compile('(?s)(?i).' + lower_char)
        q = p.match('\n' + upper_char)
        self.assertTrue(q)

        # Combinations with verbose mode, where the pattern contains spaces
        # around the flag groups.
        self.assertTrue(re.match('(?ix) ' + upper_char, lower_char))
        self.assertTrue(re.match('(?ix) ' + lower_char, upper_char))
        self.assertTrue(re.match(' (?i) ' + upper_char, lower_char, re.X))
        self.assertTrue(re.match('(?x) (?i) ' + upper_char, lower_char))
        self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char, re.X))

        # Flag group at the end of the pattern: still matches, but warns.
        # The warning text embeds the repr of the pattern and the warning is
        # expected to be attributed to this file.
        p = upper_char + '(?i)'
        with self.assertWarns(DeprecationWarning) as warns:
            self.assertTrue(re.match(p, lower_char))
        self.assertEqual(
            str(warns.warnings[0].message),
            'Flags not at the start of the expression %r' % p
        )
        self.assertEqual(warns.warnings[0].filename, __file__)

        # For a long pattern the message shows only the first 20 characters
        # followed by "(truncated)".
        p = upper_char + '(?i)%s' % ('.?' * 100)
        with self.assertWarns(DeprecationWarning) as warns:
            self.assertTrue(re.match(p, lower_char))
        self.assertEqual(
            str(warns.warnings[0].message),
            'Flags not at the start of the expression %r (truncated)' % p[:20]
        )
        self.assertEqual(warns.warnings[0].filename, __file__)

        # bpo-30605: Compiling a bytes instance regex was throwing a BytesWarning
        # BytesWarning is turned into an error here so that it cannot pass
        # unnoticed while the DeprecationWarning is being checked.
        with warnings.catch_warnings():
            warnings.simplefilter('error', BytesWarning)
            p = b'A(?i)'
            with self.assertWarns(DeprecationWarning) as warns:
                self.assertTrue(re.match(p, b'a'))
            self.assertEqual(
                str(warns.warnings[0].message),
                'Flags not at the start of the expression %r' % p
            )
            self.assertEqual(warns.warnings[0].filename, __file__)

        # Other placements that must warn: after another flag group and a
        # character, before a trailing (?x), after leading whitespace
        # without re.X, after an anchor, and inside an alternation.
        with self.assertWarns(DeprecationWarning):
            self.assertTrue(re.match('(?s).(?i)' + upper_char, '\n' + lower_char))
        with self.assertWarns(DeprecationWarning):
            self.assertTrue(re.match('(?i) ' + upper_char + ' (?x)', lower_char))
        with self.assertWarns(DeprecationWarning):
            self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char))
        with self.assertWarns(DeprecationWarning):
            self.assertTrue(re.match('^(?i)' + upper_char, lower_char))
        with self.assertWarns(DeprecationWarning):
            self.assertTrue(re.match('$|(?i)' + upper_char, lower_char))
        # Flag group nested inside a non-capturing group.
        with self.assertWarns(DeprecationWarning) as warns:
            self.assertTrue(re.match('(?:(?i)' + upper_char + ')', lower_char))
        self.assertRegex(str(warns.warnings[0].message),
                         'Flags not at the start')
        self.assertEqual(warns.warnings[0].filename, __file__)
        # Flag group inside the "yes" branch of a conditional.
        with self.assertWarns(DeprecationWarning) as warns:
            self.assertTrue(re.fullmatch('(^)?(?(1)(?i)' + upper_char + ')',
                                         lower_char))
        self.assertRegex(str(warns.warnings[0].message),
                         'Flags not at the start')
        self.assertEqual(warns.warnings[0].filename, __file__)
        # Flag group inside the "no" branch of a conditional.
        with self.assertWarns(DeprecationWarning) as warns:
            self.assertTrue(re.fullmatch('($)?(?(1)|(?i)' + upper_char + ')',
                                         lower_char))
        self.assertRegex(str(warns.warnings[0].message),
                         'Flags not at the start')
        self.assertEqual(warns.warnings[0].filename, __file__)
   1462 
   1463 
   1464     def test_dollar_matches_twice(self):
   1465         "$ matches the end of string, and just before the terminating \n"
   1466         pattern = re.compile('$')
   1467         self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
   1468         self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
   1469         self.assertEqual(pattern.sub('#', '\n'), '#\n#')
   1470 
   1471         pattern = re.compile('$', re.MULTILINE)
   1472         self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
   1473         self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
   1474         self.assertEqual(pattern.sub('#', '\n'), '#\n#')
   1475 
   1476     def test_bytes_str_mixing(self):
   1477         # Mixing str and bytes is disallowed
   1478         pat = re.compile('.')
   1479         bpat = re.compile(b'.')
   1480         self.assertRaises(TypeError, pat.match, b'b')
   1481         self.assertRaises(TypeError, bpat.match, 'b')
   1482         self.assertRaises(TypeError, pat.sub, b'b', 'c')
   1483         self.assertRaises(TypeError, pat.sub, 'b', b'c')
   1484         self.assertRaises(TypeError, pat.sub, b'b', b'c')
   1485         self.assertRaises(TypeError, bpat.sub, b'b', 'c')
   1486         self.assertRaises(TypeError, bpat.sub, 'b', b'c')
   1487         self.assertRaises(TypeError, bpat.sub, 'b', 'c')
   1488 
   1489     def test_ascii_and_unicode_flag(self):
   1490         # String patterns
   1491         for flags in (0, re.UNICODE):
   1492             pat = re.compile('\xc0', flags | re.IGNORECASE)
   1493             self.assertTrue(pat.match('\xe0'))
   1494             pat = re.compile(r'\w', flags)
   1495             self.assertTrue(pat.match('\xe0'))
   1496         pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
   1497         self.assertIsNone(pat.match('\xe0'))
   1498         pat = re.compile('(?a)\xc0', re.IGNORECASE)
   1499         self.assertIsNone(pat.match('\xe0'))
   1500         pat = re.compile(r'\w', re.ASCII)
   1501         self.assertIsNone(pat.match('\xe0'))
   1502         pat = re.compile(r'(?a)\w')
   1503         self.assertIsNone(pat.match('\xe0'))
   1504         # Bytes patterns
   1505         for flags in (0, re.ASCII):
   1506             pat = re.compile(b'\xc0', flags | re.IGNORECASE)
   1507             self.assertIsNone(pat.match(b'\xe0'))
   1508             pat = re.compile(br'\w', flags)
   1509             self.assertIsNone(pat.match(b'\xe0'))
   1510         # Incompatibilities
   1511         self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE)
   1512         self.assertRaises(re.error, re.compile, br'(?u)\w')
   1513         self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII)
   1514         self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII)
   1515         self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE)
   1516         self.assertRaises(re.error, re.compile, r'(?au)\w')
   1517 
   1518     def test_locale_flag(self):
   1519         # On Windows, Python 3.7 doesn't call setlocale(LC_CTYPE, "") at
   1520         # startup and so the LC_CTYPE locale uses Latin1 encoding by default,
   1521         # whereas getpreferredencoding() returns the ANSI code page. Set
   1522         # temporarily the LC_CTYPE locale to the user preferred encoding to
   1523         # ensure that it uses the ANSI code page.
   1524         oldloc = locale.setlocale(locale.LC_CTYPE, None)
   1525         locale.setlocale(locale.LC_CTYPE, "")
   1526         self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldloc)
   1527 
   1528         # Get the current locale encoding
   1529         enc = locale.getpreferredencoding(False)
   1530 
   1531         # Search non-ASCII letter
   1532         for i in range(128, 256):
   1533             try:
   1534                 c = bytes([i]).decode(enc)
   1535                 sletter = c.lower()
   1536                 if sletter == c: continue
   1537                 bletter = sletter.encode(enc)
   1538                 if len(bletter) != 1: continue
   1539                 if bletter.decode(enc) != sletter: continue
   1540                 bpat = re.escape(bytes([i]))
   1541                 break
   1542             except (UnicodeError, TypeError):
   1543                 pass
   1544         else:
   1545             bletter = None
   1546             bpat = b'A'
   1547         # Bytes patterns
   1548         pat = re.compile(bpat, re.LOCALE | re.IGNORECASE)
   1549         if bletter:
   1550             self.assertTrue(pat.match(bletter))
   1551         pat = re.compile(b'(?L)' + bpat, re.IGNORECASE)
   1552         if bletter:
   1553             self.assertTrue(pat.match(bletter))
   1554         pat = re.compile(bpat, re.IGNORECASE)
   1555         if bletter:
   1556             self.assertIsNone(pat.match(bletter))
   1557         pat = re.compile(br'\w', re.LOCALE)
   1558         if bletter:
   1559             self.assertTrue(pat.match(bletter))
   1560         pat = re.compile(br'(?L)\w')
   1561         if bletter:
   1562             self.assertTrue(pat.match(bletter))
   1563         pat = re.compile(br'\w')
   1564         if bletter:
   1565             self.assertIsNone(pat.match(bletter))
   1566         # Incompatibilities
   1567         self.assertRaises(ValueError, re.compile, '', re.LOCALE)
   1568         self.assertRaises(re.error, re.compile, '(?L)')
   1569         self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII)
   1570         self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII)
   1571         self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
   1572         self.assertRaises(re.error, re.compile, b'(?aL)')
   1573 
   1574     def test_scoped_flags(self):
   1575         self.assertTrue(re.match(r'(?i:a)b', 'Ab'))
   1576         self.assertIsNone(re.match(r'(?i:a)b', 'aB'))
   1577         self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE))
   1578         self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE))
   1579         self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab'))
   1580         self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB'))
   1581 
   1582         self.assertTrue(re.match(r'(?x: a) b', 'a b'))
   1583         self.assertIsNone(re.match(r'(?x: a) b', ' a b'))
   1584         self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE))
   1585         self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE))
   1586 
   1587         self.assertTrue(re.match(r'\w(?a:\W)\w', '\xe0\xe0\xe0'))
   1588         self.assertTrue(re.match(r'(?a:\W(?u:\w)\W)', '\xe0\xe0\xe0'))
   1589         self.assertTrue(re.match(r'\W(?u:\w)\W', '\xe0\xe0\xe0', re.ASCII))
   1590 
   1591         self.checkPatternError(r'(?a)(?-a:\w)',
   1592                 "bad inline flags: cannot turn off flags 'a', 'u' and 'L'", 8)
   1593         self.checkPatternError(r'(?i-i:a)',
   1594                 'bad inline flags: flag turned on and off', 5)
   1595         self.checkPatternError(r'(?au:a)',
   1596                 "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4)
   1597         self.checkPatternError(br'(?aL:a)',
   1598                 "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4)
   1599 
   1600         self.checkPatternError(r'(?-', 'missing flag', 3)
   1601         self.checkPatternError(r'(?-+', 'missing flag', 3)
   1602         self.checkPatternError(r'(?-z', 'unknown flag', 3)
   1603         self.checkPatternError(r'(?-i', 'missing :', 4)
   1604         self.checkPatternError(r'(?-i)', 'missing :', 4)
   1605         self.checkPatternError(r'(?-i+', 'missing :', 4)
   1606         self.checkPatternError(r'(?-iz', 'unknown flag', 4)
   1607         self.checkPatternError(r'(?i:', 'missing ), unterminated subpattern', 0)
   1608         self.checkPatternError(r'(?i', 'missing -, : or )', 3)
   1609         self.checkPatternError(r'(?i+', 'missing -, : or )', 3)
   1610         self.checkPatternError(r'(?iz', 'unknown flag', 3)
   1611 
   1612     def test_bug_6509(self):
   1613         # Replacement strings of both types must parse properly.
   1614         # all strings
   1615         pat = re.compile(r'a(\w)')
   1616         self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
   1617         pat = re.compile('a(.)')
   1618         self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
   1619         pat = re.compile('..')
   1620         self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
   1621 
   1622         # all bytes
   1623         pat = re.compile(br'a(\w)')
   1624         self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
   1625         pat = re.compile(b'a(.)')
   1626         self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
   1627         pat = re.compile(b'..')
   1628         self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
   1629 
   1630     def test_dealloc(self):
   1631         # issue 3299: check for segfault in debug build
   1632         import _sre
   1633         # the overflow limit is different on wide and narrow builds and it
   1634         # depends on the definition of SRE_CODE (see sre.h).
   1635         # 2**128 should be big enough to overflow on both. For smaller values
   1636         # a RuntimeError is raised instead of OverflowError.
   1637         long_overflow = 2**128
   1638         self.assertRaises(TypeError, re.finditer, "a", {})
   1639         with self.assertRaises(OverflowError):
   1640             _sre.compile("abc", 0, [long_overflow], 0, {}, ())
   1641         with self.assertRaises(TypeError):
   1642             _sre.compile({}, 0, [], 0, [], [])
   1643 
   1644     def test_search_dot_unicode(self):
   1645         self.assertTrue(re.search("123.*-", '123abc-'))
   1646         self.assertTrue(re.search("123.*-", '123\xe9-'))
   1647         self.assertTrue(re.search("123.*-", '123\u20ac-'))
   1648         self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
   1649         self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
   1650 
   1651     def test_compile(self):
   1652         # Test return value when given string and pattern as parameter
   1653         pattern = re.compile('random pattern')
   1654         self.assertIsInstance(pattern, re.Pattern)
   1655         same_pattern = re.compile(pattern)
   1656         self.assertIsInstance(same_pattern, re.Pattern)
   1657         self.assertIs(same_pattern, pattern)
   1658         # Test behaviour when not given a string or pattern as parameter
   1659         self.assertRaises(TypeError, re.compile, 0)
   1660 
   1661     @bigmemtest(size=_2G, memuse=1)
   1662     def test_large_search(self, size):
   1663         # Issue #10182: indices were 32-bit-truncated.
   1664         s = 'a' * size
   1665         m = re.search('$', s)
   1666         self.assertIsNotNone(m)
   1667         self.assertEqual(m.start(), size)
   1668         self.assertEqual(m.end(), size)
   1669 
   1670     # The huge memuse is because of re.sub() using a list and a join()
   1671     # to create the replacement result.
   1672     @bigmemtest(size=_2G, memuse=16 + 2)
   1673     def test_large_subn(self, size):
   1674         # Issue #10182: indices were 32-bit-truncated.
   1675         s = 'a' * size
   1676         r, n = re.subn('', '', s)
   1677         self.assertEqual(r, s)
   1678         self.assertEqual(n, size + 1)
   1679 
   1680     def test_bug_16688(self):
   1681         # Issue 16688: Backreferences make case-insensitive regex fail on
   1682         # non-ASCII strings.
   1683         self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
   1684         self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
   1685 
   1686     def test_repeat_minmax_overflow(self):
   1687         # Issue #13169
   1688         string = "x" * 100000
   1689         self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
   1690         self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
   1691         self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
   1692         self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
   1693         self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
   1694         self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
   1695         # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
   1696         self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
   1697         self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
   1698         self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
   1699         self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
   1700 
   1701     @cpython_only
   1702     def test_repeat_minmax_overflow_maxrepeat(self):
   1703         try:
   1704             from _sre import MAXREPEAT
   1705         except ImportError:
   1706             self.skipTest('requires _sre.MAXREPEAT constant')
   1707         string = "x" * 100000
   1708         self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
   1709         self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
   1710                          (0, 100000))
   1711         self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
   1712         self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
   1713         self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
   1714         self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
   1715 
   1716     def test_backref_group_name_in_exception(self):
   1717         # Issue 17341: Poor error message when compiling invalid regex
   1718         self.checkPatternError('(?P=<foo>)',
   1719                                "bad character in group name '<foo>'", 4)
   1720 
   1721     def test_group_name_in_exception(self):
   1722         # Issue 17341: Poor error message when compiling invalid regex
   1723         self.checkPatternError('(?P<?foo>)',
   1724                                "bad character in group name '?foo'", 4)
   1725 
   1726     def test_issue17998(self):
   1727         for reps in '*', '+', '?', '{1}':
   1728             for mod in '', '?':
   1729                 pattern = '.' + reps + mod + 'yz'
   1730                 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
   1731                                  ['xyz'], msg=pattern)
   1732                 pattern = pattern.encode()
   1733                 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
   1734                                  [b'xyz'], msg=pattern)
   1735 
   1736     def test_match_repr(self):
   1737         for string in '[abracadabra]', S('[abracadabra]'):
   1738             m = re.search(r'(.+)(.*?)\1', string)
   1739             self.assertEqual(repr(m), "<%s.%s object; "
   1740                              "span=(1, 12), match='abracadabra'>" %
   1741                              (type(m).__module__, type(m).__qualname__))
   1742         for string in (b'[abracadabra]', B(b'[abracadabra]'),
   1743                        bytearray(b'[abracadabra]'),
   1744                        memoryview(b'[abracadabra]')):
   1745             m = re.search(br'(.+)(.*?)\1', string)
   1746             self.assertEqual(repr(m), "<%s.%s object; "
   1747                              "span=(1, 12), match=b'abracadabra'>" %
   1748                              (type(m).__module__, type(m).__qualname__))
   1749 
   1750         first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
   1751         self.assertEqual(repr(first), "<%s.%s object; "
   1752                          "span=(0, 2), match='aa'>" %
   1753                          (type(second).__module__, type(first).__qualname__))
   1754         self.assertEqual(repr(second), "<%s.%s object; "
   1755                          "span=(3, 5), match='bb'>" %
   1756                          (type(second).__module__, type(second).__qualname__))
   1757 
   1758     def test_zerowidth(self):
   1759         # Issues 852532, 1647489, 3262, 25054.
   1760         self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', ''])
   1761         self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', '', 'bc', ''])
   1762         self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', '', 'bc'])
   1763         self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', ''])
   1764 
   1765         self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-')
   1766         self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a---bc-')
   1767         self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::][]bc[]')
   1768 
   1769         self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', ''])
   1770         self.assertEqual(re.findall(r"\b|\w+", "a::bc"),
   1771                          ['', 'a', '', '', 'bc', ''])
   1772 
   1773         self.assertEqual([m.span() for m in re.finditer(r"\b|:+", "a::bc")],
   1774                          [(0, 0), (1, 1), (1, 3), (3, 3), (5, 5)])
   1775         self.assertEqual([m.span() for m in re.finditer(r"\b|\w+", "a::bc")],
   1776                          [(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)])
   1777 
   1778     def test_bug_2537(self):
   1779         # issue 2537: empty submatches
   1780         for outer_op in ('{0,}', '*', '+', '{1,187}'):
   1781             for inner_op in ('{0,}', '*', '?'):
   1782                 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
   1783                 m = r.match("xyyzy")
   1784                 self.assertEqual(m.group(0), "xyy")
   1785                 self.assertEqual(m.group(1), "")
   1786                 self.assertEqual(m.group(2), "y")
   1787 
   1788     @cpython_only
   1789     def test_debug_flag(self):
   1790         pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
   1791         with captured_stdout() as out:
   1792             re.compile(pat, re.DEBUG)
   1793         self.maxDiff = None
   1794         dump = '''\
   1795 SUBPATTERN 1 0 0
   1796   LITERAL 46
   1797 BRANCH
   1798   IN
   1799     LITERAL 99
   1800     LITERAL 104
   1801 OR
   1802   LITERAL 112
   1803   LITERAL 121
   1804 GROUPREF_EXISTS 1
   1805   AT AT_END
   1806 ELSE
   1807   LITERAL 58
   1808   LITERAL 32
   1809 
   1810  0. INFO 8 0b1 2 5 (to 9)
   1811       prefix_skip 0
   1812       prefix [0x2e] ('.')
   1813       overlap [0]
   1814  9: MARK 0
   1815 11. LITERAL 0x2e ('.')
   1816 13. MARK 1
   1817 15. BRANCH 10 (to 26)
   1818 17.   IN 6 (to 24)
   1819 19.     LITERAL 0x63 ('c')
   1820 21.     LITERAL 0x68 ('h')
   1821 23.     FAILURE
   1822 24:   JUMP 9 (to 34)
   1823 26: branch 7 (to 33)
   1824 27.   LITERAL 0x70 ('p')
   1825 29.   LITERAL 0x79 ('y')
   1826 31.   JUMP 2 (to 34)
   1827 33: FAILURE
   1828 34: GROUPREF_EXISTS 0 6 (to 41)
   1829 37. AT END
   1830 39. JUMP 5 (to 45)
   1831 41: LITERAL 0x3a (':')
   1832 43. LITERAL 0x20 (' ')
   1833 45: SUCCESS
   1834 '''
   1835         self.assertEqual(out.getvalue(), dump)
   1836         # Debug output is output again even a second time (bypassing
   1837         # the cache -- issue #20426).
   1838         with captured_stdout() as out:
   1839             re.compile(pat, re.DEBUG)
   1840         self.assertEqual(out.getvalue(), dump)
   1841 
   1842     def test_keyword_parameters(self):
   1843         # Issue #20283: Accepting the string keyword parameter.
   1844         pat = re.compile(r'(ab)')
   1845         self.assertEqual(
   1846             pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
   1847         self.assertEqual(
   1848             pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
   1849         self.assertEqual(
   1850             pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
   1851         self.assertEqual(
   1852             pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
   1853         self.assertEqual(
   1854             pat.split(string='abracadabra', maxsplit=1),
   1855             ['', 'ab', 'racadabra'])
   1856         self.assertEqual(
   1857             pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
   1858             (7, 9))
   1859 
   1860     def test_bug_20998(self):
   1861         # Issue #20998: Fullmatch of repeated single character pattern
   1862         # with ignore case.
   1863         self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
   1864 
   1865     def test_locale_caching(self):
   1866         # Issue #22410
   1867         oldlocale = locale.setlocale(locale.LC_CTYPE)
   1868         self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
   1869         for loc in 'en_US.iso88591', 'en_US.utf8':
   1870             try:
   1871                 locale.setlocale(locale.LC_CTYPE, loc)
   1872             except locale.Error:
   1873                 # Unsupported locale on this system
   1874                 self.skipTest('test needs %s locale' % loc)
   1875 
   1876         re.purge()
   1877         self.check_en_US_iso88591()
   1878         self.check_en_US_utf8()
   1879         re.purge()
   1880         self.check_en_US_utf8()
   1881         self.check_en_US_iso88591()
   1882 
   1883     def check_en_US_iso88591(self):
   1884         locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
   1885         self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
   1886         self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))
   1887         self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I))
   1888         self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
   1889         self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5'))
   1890         self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5'))
   1891 
   1892     def check_en_US_utf8(self):
   1893         locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
   1894         self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
   1895         self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I))
   1896         self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I))
   1897         self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
   1898         self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
   1899         self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
   1900 
   1901     def test_locale_compiled(self):
   1902         oldlocale = locale.setlocale(locale.LC_CTYPE)
   1903         self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
   1904         for loc in 'en_US.iso88591', 'en_US.utf8':
   1905             try:
   1906                 locale.setlocale(locale.LC_CTYPE, loc)
   1907             except locale.Error:
   1908                 # Unsupported locale on this system
   1909                 self.skipTest('test needs %s locale' % loc)
   1910 
   1911         locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
   1912         p1 = re.compile(b'\xc5\xe5', re.L|re.I)
   1913         p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
   1914         p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
   1915         p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
   1916         for p in p1, p2, p3:
   1917             self.assertTrue(p.match(b'\xc5\xe5'))
   1918             self.assertTrue(p.match(b'\xe5\xe5'))
   1919             self.assertTrue(p.match(b'\xc5\xc5'))
   1920         self.assertIsNone(p4.match(b'\xe5\xc5'))
   1921         self.assertIsNone(p4.match(b'\xe5\xe5'))
   1922         self.assertIsNone(p4.match(b'\xc5\xc5'))
   1923 
   1924         locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
   1925         for p in p1, p2, p3:
   1926             self.assertTrue(p.match(b'\xc5\xe5'))
   1927             self.assertIsNone(p.match(b'\xe5\xe5'))
   1928             self.assertIsNone(p.match(b'\xc5\xc5'))
   1929         self.assertTrue(p4.match(b'\xe5\xc5'))
   1930         self.assertIsNone(p4.match(b'\xe5\xe5'))
   1931         self.assertIsNone(p4.match(b'\xc5\xc5'))
   1932 
   1933     def test_error(self):
   1934         with self.assertRaises(re.error) as cm:
   1935             re.compile('(\u20ac))')
   1936         err = cm.exception
   1937         self.assertIsInstance(err.pattern, str)
   1938         self.assertEqual(err.pattern, '(\u20ac))')
   1939         self.assertEqual(err.pos, 3)
   1940         self.assertEqual(err.lineno, 1)
   1941         self.assertEqual(err.colno, 4)
   1942         self.assertIn(err.msg, str(err))
   1943         self.assertIn(' at position 3', str(err))
   1944         self.assertNotIn(' at position 3', err.msg)
   1945         # Bytes pattern
   1946         with self.assertRaises(re.error) as cm:
   1947             re.compile(b'(\xa4))')
   1948         err = cm.exception
   1949         self.assertIsInstance(err.pattern, bytes)
   1950         self.assertEqual(err.pattern, b'(\xa4))')
   1951         self.assertEqual(err.pos, 3)
   1952         # Multiline pattern
   1953         with self.assertRaises(re.error) as cm:
   1954             re.compile("""
   1955                 (
   1956                     abc
   1957                 )
   1958                 )
   1959                 (
   1960                 """, re.VERBOSE)
   1961         err = cm.exception
   1962         self.assertEqual(err.pos, 77)
   1963         self.assertEqual(err.lineno, 5)
   1964         self.assertEqual(err.colno, 17)
   1965         self.assertIn(err.msg, str(err))
   1966         self.assertIn(' at position 77', str(err))
   1967         self.assertIn('(line 5, column 17)', str(err))
   1968 
   1969     def test_misc_errors(self):
   1970         self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0)
   1971         self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0)
   1972         self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5)
   1973         self.checkPatternError(r'(?P', 'unexpected end of pattern', 3)
   1974         self.checkPatternError(r'(?z)', 'unknown extension ?z', 1)
   1975         self.checkPatternError(r'(?iz)', 'unknown flag', 3)
   1976         self.checkPatternError(r'(?i', 'missing -, : or )', 3)
   1977         self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0)
   1978         self.checkPatternError(r'(?<', 'unexpected end of pattern', 3)
   1979         self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1)
   1980         self.checkPatternError(r'(?', 'unexpected end of pattern', 2)
   1981 
   1982     def test_enum(self):
   1983         # Issue #28082: Check that str(flag) returns a human readable string
   1984         # instead of an integer
   1985         self.assertIn('ASCII', str(re.A))
   1986         self.assertIn('DOTALL', str(re.S))
   1987 
   1988     def test_pattern_compare(self):
   1989         pattern1 = re.compile('abc', re.IGNORECASE)
   1990 
   1991         # equal to itself
   1992         self.assertEqual(pattern1, pattern1)
   1993         self.assertFalse(pattern1 != pattern1)
   1994 
   1995         # equal
   1996         re.purge()
   1997         pattern2 = re.compile('abc', re.IGNORECASE)
   1998         self.assertEqual(hash(pattern2), hash(pattern1))
   1999         self.assertEqual(pattern2, pattern1)
   2000 
   2001         # not equal: different pattern
   2002         re.purge()
   2003         pattern3 = re.compile('XYZ', re.IGNORECASE)
   2004         # Don't test hash(pattern3) != hash(pattern1) because there is no
   2005         # warranty that hash values are different
   2006         self.assertNotEqual(pattern3, pattern1)
   2007 
   2008         # not equal: different flag (flags=0)
   2009         re.purge()
   2010         pattern4 = re.compile('abc')
   2011         self.assertNotEqual(pattern4, pattern1)
   2012 
   2013         # only == and != comparison operators are supported
   2014         with self.assertRaises(TypeError):
   2015             pattern1 < pattern2
   2016 
   2017     def test_pattern_compare_bytes(self):
   2018         pattern1 = re.compile(b'abc')
   2019 
   2020         # equal: test bytes patterns
   2021         re.purge()
   2022         pattern2 = re.compile(b'abc')
   2023         self.assertEqual(hash(pattern2), hash(pattern1))
   2024         self.assertEqual(pattern2, pattern1)
   2025 
   2026         # not equal: pattern of a different types (str vs bytes),
   2027         # comparison must not raise a BytesWarning
   2028         re.purge()
   2029         pattern3 = re.compile('abc')
   2030         with warnings.catch_warnings():
   2031             warnings.simplefilter('error', BytesWarning)
   2032             self.assertNotEqual(pattern3, pattern1)
   2033 
   2034     def test_bug_29444(self):
   2035         s = bytearray(b'abcdefgh')
   2036         m = re.search(b'[a-h]+', s)
   2037         m2 = re.search(b'[e-h]+', s)
   2038         self.assertEqual(m.group(), b'abcdefgh')
   2039         self.assertEqual(m2.group(), b'efgh')
   2040         s[:] = b'xyz'
   2041         self.assertEqual(m.group(), b'xyz')
   2042         self.assertEqual(m2.group(), b'')
   2043 
   2044     def test_bug_34294(self):
   2045         # Issue 34294: wrong capturing groups
   2046 
   2047         # exists since Python 2
   2048         s = "a\tx"
   2049         p = r"\b(?=(\t)|(x))x"
   2050         self.assertEqual(re.search(p, s).groups(), (None, 'x'))
   2051 
   2052         # introduced in Python 3.7.0
   2053         s = "ab"
   2054         p = r"(?=(.)(.)?)"
   2055         self.assertEqual(re.findall(p, s),
   2056                          [('a', 'b'), ('b', '')])
   2057         self.assertEqual([m.groups() for m in re.finditer(p, s)],
   2058                          [('a', 'b'), ('b', None)])
   2059 
   2060         # test-cases provided by issue34294, introduced in Python 3.7.0
   2061         p = r"(?=<(?P<tag>\w+)/?>(?:(?P<text>.+?)</(?P=tag)>)?)"
   2062         s = "<test><foo2/></test>"
   2063         self.assertEqual(re.findall(p, s),
   2064                          [('test', '<foo2/>'), ('foo2', '')])
   2065         self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
   2066                          [{'tag': 'test', 'text': '<foo2/>'},
   2067                           {'tag': 'foo2', 'text': None}])
   2068         s = "<test>Hello</test><foo/>"
   2069         self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
   2070                          [{'tag': 'test', 'text': 'Hello'},
   2071                           {'tag': 'foo', 'text': None}])
   2072         s = "<test>Hello</test><foo/><foo/>"
   2073         self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
   2074                          [{'tag': 'test', 'text': 'Hello'},
   2075                           {'tag': 'foo', 'text': None},
   2076                           {'tag': 'foo', 'text': None}])
   2077 
   2078 
   2079 class PatternReprTests(unittest.TestCase):
   2080     def check(self, pattern, expected):
   2081         self.assertEqual(repr(re.compile(pattern)), expected)
   2082 
   2083     def check_flags(self, pattern, flags, expected):
   2084         self.assertEqual(repr(re.compile(pattern, flags)), expected)
   2085 
   2086     def test_without_flags(self):
   2087         self.check('random pattern',
   2088                    "re.compile('random pattern')")
   2089 
   2090     def test_single_flag(self):
   2091         self.check_flags('random pattern', re.IGNORECASE,
   2092             "re.compile('random pattern', re.IGNORECASE)")
   2093 
   2094     def test_multiple_flags(self):
   2095         self.check_flags('random pattern', re.I|re.S|re.X,
   2096             "re.compile('random pattern', "
   2097             "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
   2098 
   2099     def test_unicode_flag(self):
   2100         self.check_flags('random pattern', re.U,
   2101                          "re.compile('random pattern')")
   2102         self.check_flags('random pattern', re.I|re.S|re.U,
   2103                          "re.compile('random pattern', "
   2104                          "re.IGNORECASE|re.DOTALL)")
   2105 
   2106     def test_inline_flags(self):
   2107         self.check('(?i)pattern',
   2108                    "re.compile('(?i)pattern', re.IGNORECASE)")
   2109 
   2110     def test_unknown_flags(self):
   2111         self.check_flags('random pattern', 0x123000,
   2112                          "re.compile('random pattern', 0x123000)")
   2113         self.check_flags('random pattern', 0x123000|re.I,
   2114             "re.compile('random pattern', re.IGNORECASE|0x123000)")
   2115 
   2116     def test_bytes(self):
   2117         self.check(b'bytes pattern',
   2118                    "re.compile(b'bytes pattern')")
   2119         self.check_flags(b'bytes pattern', re.A,
   2120                          "re.compile(b'bytes pattern', re.ASCII)")
   2121 
   2122     def test_locale(self):
   2123         self.check_flags(b'bytes pattern', re.L,
   2124                          "re.compile(b'bytes pattern', re.LOCALE)")
   2125 
   2126     def test_quotes(self):
   2127         self.check('random "double quoted" pattern',
   2128             '''re.compile('random "double quoted" pattern')''')
   2129         self.check("random 'single quoted' pattern",
   2130             '''re.compile("random 'single quoted' pattern")''')
   2131         self.check('''both 'single' and "double" quotes''',
   2132             '''re.compile('both \\'single\\' and "double" quotes')''')
   2133 
   2134     def test_long_pattern(self):
   2135         pattern = 'Very %spattern' % ('long ' * 1000)
   2136         r = repr(re.compile(pattern))
   2137         self.assertLess(len(r), 300)
   2138         self.assertEqual(r[:30], "re.compile('Very long long lon")
   2139         r = repr(re.compile(pattern, re.I))
   2140         self.assertLess(len(r), 300)
   2141         self.assertEqual(r[:30], "re.compile('Very long long lon")
   2142         self.assertEqual(r[-16:], ", re.IGNORECASE)")
   2143 
   2144 
   2145 class ImplementationTest(unittest.TestCase):
   2146     """
   2147     Test implementation details of the re module.
   2148     """
   2149 
   2150     def test_overlap_table(self):
   2151         f = sre_compile._generate_overlap_table
   2152         self.assertEqual(f(""), [])
   2153         self.assertEqual(f("a"), [0])
   2154         self.assertEqual(f("abcd"), [0, 0, 0, 0])
   2155         self.assertEqual(f("aaaa"), [0, 1, 2, 3])
   2156         self.assertEqual(f("ababba"), [0, 0, 1, 2, 0, 1])
   2157         self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0])
   2158 
   2159 
   2160 class ExternalTests(unittest.TestCase):
   2161 
   2162     def test_re_benchmarks(self):
   2163         're_tests benchmarks'
   2164         from test.re_tests import benchmarks
   2165         for pattern, s in benchmarks:
   2166             with self.subTest(pattern=pattern, string=s):
   2167                 p = re.compile(pattern)
   2168                 self.assertTrue(p.search(s))
   2169                 self.assertTrue(p.match(s))
   2170                 self.assertTrue(p.fullmatch(s))
   2171                 s2 = ' '*10000 + s + ' '*10000
   2172                 self.assertTrue(p.search(s2))
   2173                 self.assertTrue(p.match(s2, 10000))
   2174                 self.assertTrue(p.match(s2, 10000, 10000 + len(s)))
   2175                 self.assertTrue(p.fullmatch(s2, 10000, 10000 + len(s)))
   2176 
   2177     def test_re_tests(self):
   2178         're_tests test suite'
   2179         from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
   2180         for t in tests:
   2181             pattern = s = outcome = repl = expected = None
   2182             if len(t) == 5:
   2183                 pattern, s, outcome, repl, expected = t
   2184             elif len(t) == 3:
   2185                 pattern, s, outcome = t
   2186             else:
   2187                 raise ValueError('Test tuples should have 3 or 5 fields', t)
   2188 
   2189             with self.subTest(pattern=pattern, string=s):
   2190                 if outcome == SYNTAX_ERROR:  # Expected a syntax error
   2191                     with self.assertRaises(re.error):
   2192                         re.compile(pattern)
   2193                     continue
   2194 
   2195                 obj = re.compile(pattern)
   2196                 result = obj.search(s)
   2197                 if outcome == FAIL:
   2198                     self.assertIsNone(result, 'Succeeded incorrectly')
   2199                     continue
   2200 
   2201                 with self.subTest():
   2202                     self.assertTrue(result, 'Failed incorrectly')
   2203                     # Matched, as expected, so now we compute the
   2204                     # result string and compare it to our expected result.
   2205                     start, end = result.span(0)
   2206                     vardict = {'found': result.group(0),
   2207                                'groups': result.group(),
   2208                                'flags': result.re.flags}
   2209                     for i in range(1, 100):
   2210                         try:
   2211                             gi = result.group(i)
   2212                             # Special hack because else the string concat fails:
   2213                             if gi is None:
   2214                                 gi = "None"
   2215                         except IndexError:
   2216                             gi = "Error"
   2217                         vardict['g%d' % i] = gi
   2218                     for i in result.re.groupindex.keys():
   2219                         try:
   2220                             gi = result.group(i)
   2221                             if gi is None:
   2222                                 gi = "None"
   2223                         except IndexError:
   2224                             gi = "Error"
   2225                         vardict[i] = gi
   2226                     self.assertEqual(eval(repl, vardict), expected,
   2227                                      'grouping error')
   2228 
   2229                 # Try the match with both pattern and string converted to
   2230                 # bytes, and check that it still succeeds.
   2231                 try:
   2232                     bpat = bytes(pattern, "ascii")
   2233                     bs = bytes(s, "ascii")
   2234                 except UnicodeEncodeError:
   2235                     # skip non-ascii tests
   2236                     pass
   2237                 else:
   2238                     with self.subTest('bytes pattern match'):
   2239                         obj = re.compile(bpat)
   2240                         self.assertTrue(obj.search(bs))
   2241 
   2242                     # Try the match with LOCALE enabled, and check that it
   2243                     # still succeeds.
   2244                     with self.subTest('locale-sensitive match'):
   2245                         obj = re.compile(bpat, re.LOCALE)
   2246                         result = obj.search(bs)
   2247                         if result is None:
   2248                             print('=== Fails on locale-sensitive match', t)
   2249 
   2250                 # Try the match with the search area limited to the extent
   2251                 # of the match and see if it still succeeds.  \B will
   2252                 # break (because it won't match at the end or start of a
   2253                 # string), so we'll ignore patterns that feature it.
   2254                 if (pattern[:2] != r'\B' and pattern[-2:] != r'\B'
   2255                             and result is not None):
   2256                     with self.subTest('range-limited match'):
   2257                         obj = re.compile(pattern)
   2258                         self.assertTrue(obj.search(s, start, end + 1))
   2259 
   2260                 # Try the match with IGNORECASE enabled, and check that it
   2261                 # still succeeds.
   2262                 with self.subTest('case-insensitive match'):
   2263                     obj = re.compile(pattern, re.IGNORECASE)
   2264                     self.assertTrue(obj.search(s))
   2265 
   2266                 # Try the match with UNICODE locale enabled, and check
   2267                 # that it still succeeds.
   2268                 with self.subTest('unicode-sensitive match'):
   2269                     obj = re.compile(pattern, re.UNICODE)
   2270                     self.assertTrue(obj.search(s))
   2271 
   2272 
# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
   2275