Home | History | Annotate | Download | only in test
      1 # -*- coding: utf-8 -*-
      2 from test.test_support import (
      3     verbose, run_unittest, import_module,
      4     precisionbigmemtest, _2G, cpython_only,
      5     captured_stdout, have_unicode, requires_unicode, u,
      6     check_warnings)
      7 import locale
      8 import re
      9 from re import Scanner
     10 import sre_constants
     11 import sys
     12 import string
     13 import traceback
     14 from weakref import proxy
     15 
     16 
     17 # Misc tests from Tim Peters' re.doc
     18 
     19 # WARNING: Don't change details in these tests if you don't know
     20 # what you're doing. Some of these tests were carefully modeled to
     21 # cover most of the code.
     22 
     23 import unittest
     24 
     25 class ReTests(unittest.TestCase):
     26 
     27     def test_weakref(self):
     28         s = 'QabbbcR'
     29         x = re.compile('ab+c')
     30         y = proxy(x)
     31         self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
     32 
     33     def test_search_star_plus(self):
     34         self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
     35         self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
     36         self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
     37         self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
     38         self.assertIsNone(re.search('x', 'aaa'))
     39         self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
     40         self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
     41         self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
     42         self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
     43         self.assertIsNone(re.match('a+', 'xxx'))
     44 
     45     def bump_num(self, matchobj):
     46         int_value = int(matchobj.group(0))
     47         return str(int_value + 1)
     48 
     49     def test_basic_re_sub(self):
     50         self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
     51         self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
     52                          '9.3 -3 24x100y')
     53         self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
     54                          '9.3 -3 23x99y')
     55 
     56         self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
     57         self.assertEqual(re.sub('.', r"\n", 'x'), '\n')
     58 
     59         s = r"\1\1"
     60         self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
     61         self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
     62         self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)
     63 
     64         self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
     65         self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
     66         self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
     67         self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
     68 
     69         self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
     70                          '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
     71         self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
     72         self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
     73                          (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))
     74 
     75         self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
     76 
     77     def test_bug_449964(self):
     78         # fails for group followed by other escape
     79         self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
     80                          'xx\bxx\b')
     81 
     82     def test_bug_449000(self):
     83         # Test for sub() on escaped characters
     84         self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
     85                          'abc\ndef\n')
     86         self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
     87                          'abc\ndef\n')
     88         self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
     89                          'abc\ndef\n')
     90         self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
     91                          'abc\ndef\n')
     92 
     93     @requires_unicode
     94     def test_bug_1140(self):
     95         # re.sub(x, y, u'') should return u'', not '', and
     96         # re.sub(x, y, '') should return '', not u''.
     97         # Also:
     98         # re.sub(x, y, unicode(x)) should return unicode(y), and
     99         # re.sub(x, y, str(x)) should return
    100         #     str(y) if isinstance(y, str) else unicode(y).
    101         for x in 'x', u'x':
    102             for y in 'y', u'y':
    103                 z = re.sub(x, y, u'')
    104                 self.assertEqual(z, u'')
    105                 self.assertEqual(type(z), unicode)
    106                 #
    107                 z = re.sub(x, y, '')
    108                 self.assertEqual(z, '')
    109                 self.assertEqual(type(z), str)
    110                 #
    111                 z = re.sub(x, y, unicode(x))
    112                 self.assertEqual(z, y)
    113                 self.assertEqual(type(z), unicode)
    114                 #
    115                 z = re.sub(x, y, str(x))
    116                 self.assertEqual(z, y)
    117                 self.assertEqual(type(z), type(y))
    118 
    119     def test_bug_1661(self):
    120         # Verify that flags do not get silently ignored with compiled patterns
    121         pattern = re.compile('.')
    122         self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
    123         self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
    124         self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
    125         self.assertRaises(ValueError, re.compile, pattern, re.I)
    126 
    127     def test_bug_3629(self):
    128         # A regex that triggered a bug in the sre-code validator
    129         re.compile("(?P<quote>)(?(quote))")
    130 
    131     def test_sub_template_numeric_escape(self):
    132         # bug 776311 and friends
    133         self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
    134         self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
    135         self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
    136         self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
    137         self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
    138         self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
    139         self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
    140 
    141         self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
    142         self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
    143 
    144         self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
    145         self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
    146         self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
    147         self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
    148         self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
    149 
    150         self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
    151         self.assertEqual(re.sub('x', r'\777', 'x'), '\377')
    152 
    153         self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
    154         self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
    155         self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
    156         self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
    157         self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
    158         self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
    159         self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
    160         self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
    161         self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
    162         self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
    163         self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
    164         self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
    165 
    166         # in python2.3 (etc), these loop endlessly in sre_parser.py
    167         self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
    168         self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
    169                          'xz8')
    170         self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
    171                          'xza')
    172 
    173     def test_qualified_re_sub(self):
    174         self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
    175         self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
    176 
    177     def test_bug_114660(self):
    178         self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello  there'),
    179                          'hello there')
    180 
    181     def test_bug_462270(self):
    182         # Test for empty sub() behaviour, see SF bug #462270
    183         self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
    184         self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
    185 
    186     def test_symbolic_groups(self):
    187         re.compile('(?P<a>x)(?P=a)(?(a)y)')
    188         re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
    189         self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
    190         self.assertRaises(re.error, re.compile, '(?Px)')
    191         self.assertRaises(re.error, re.compile, '(?P=)')
    192         self.assertRaises(re.error, re.compile, '(?P=1)')
    193         self.assertRaises(re.error, re.compile, '(?P=a)')
    194         self.assertRaises(re.error, re.compile, '(?P=a1)')
    195         self.assertRaises(re.error, re.compile, '(?P=a.)')
    196         self.assertRaises(re.error, re.compile, '(?P<)')
    197         self.assertRaises(re.error, re.compile, '(?P<>)')
    198         self.assertRaises(re.error, re.compile, '(?P<1>)')
    199         self.assertRaises(re.error, re.compile, '(?P<a.>)')
    200         self.assertRaises(re.error, re.compile, '(?())')
    201         self.assertRaises(re.error, re.compile, '(?(a))')
    202         self.assertRaises(re.error, re.compile, '(?(1a))')
    203         self.assertRaises(re.error, re.compile, '(?(a.))')
    204 
    205     def test_symbolic_refs(self):
    206         self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
    207         self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
    208         self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
    209         self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
    210         self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<>', 'xx')
    211         self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
    212         self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
    213         self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
    214         self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
    215         self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
    216 
    217     def test_re_subn(self):
    218         self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
    219         self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
    220         self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
    221         self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
    222         self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
    223 
    224     def test_re_split(self):
    225         self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
    226         self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
    227         self.assertEqual(re.split("(:*)", ":a:b::c"),
    228                          ['', ':', 'a', ':', 'b', '::', 'c'])
    229         self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
    230         self.assertEqual(re.split("(:)*", ":a:b::c"),
    231                          ['', ':', 'a', ':', 'b', ':', 'c'])
    232         self.assertEqual(re.split("([b:]+)", ":a:b::c"),
    233                          ['', ':', 'a', ':b::', 'c'])
    234         self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
    235                          ['', None, ':', 'a', None, ':', '', 'b', None, '',
    236                           None, '::', 'c'])
    237         self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
    238                          ['', 'a', '', '', 'c'])
    239 
    240     def test_qualified_re_split(self):
    241         self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
    242         self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
    243         self.assertEqual(re.split("(:)", ":a:b::c", 2),
    244                          ['', ':', 'a', ':', 'b::c'])
    245         self.assertEqual(re.split("(:*)", ":a:b::c", 2),
    246                          ['', ':', 'a', ':', 'b::c'])
    247 
    248     def test_re_findall(self):
    249         self.assertEqual(re.findall(":+", "abc"), [])
    250         self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
    251         self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
    252         self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
    253                                                                (":", ":"),
    254                                                                (":", "::")])
    255 
    256     def test_bug_117612(self):
    257         self.assertEqual(re.findall(r"(a|(b))", "aba"),
    258                          [("a", ""),("b", "b"),("a", "")])
    259 
    260     def test_re_match(self):
    261         self.assertEqual(re.match('a', 'a').groups(), ())
    262         self.assertEqual(re.match('(a)', 'a').groups(), ('a',))
    263         self.assertEqual(re.match(r'(a)', 'a').group(0), 'a')
    264         self.assertEqual(re.match(r'(a)', 'a').group(1), 'a')
    265         self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a'))
    266 
    267         pat = re.compile('((a)|(b))(c)?')
    268         self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
    269         self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
    270         self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
    271         self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
    272         self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
    273 
    274         # A single group
    275         m = re.match('(a)', 'a')
    276         self.assertEqual(m.group(0), 'a')
    277         self.assertEqual(m.group(0), 'a')
    278         self.assertEqual(m.group(1), 'a')
    279         self.assertEqual(m.group(1, 1), ('a', 'a'))
    280 
    281         pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    282         self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    283         self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
    284                          (None, 'b', None))
    285         self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
    286 
    287     def test_re_groupref_exists(self):
    288         self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
    289                          ('(', 'a'))
    290         self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
    291                          (None, 'a'))
    292         self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
    293         self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
    294         self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
    295                          ('a', 'b'))
    296         self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
    297                          (None, 'd'))
    298         self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
    299                          (None, 'd'))
    300         self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
    301                          ('a', ''))
    302 
    303         # Tests for bug #1177831: exercise groups other than the first group
    304         p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
    305         self.assertEqual(p.match('abc').groups(),
    306                          ('a', 'b', 'c'))
    307         self.assertEqual(p.match('ad').groups(),
    308                          ('a', None, 'd'))
    309         self.assertIsNone(p.match('abd'))
    310         self.assertIsNone(p.match('ac'))
    311 
    312 
    313     def test_re_groupref(self):
    314         self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
    315                          ('|', 'a'))
    316         self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
    317                          (None, 'a'))
    318         self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
    319         self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
    320         self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
    321                          ('a', 'a'))
    322         self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
    323                          (None, None))
    324 
    325     def test_groupdict(self):
    326         self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
    327                                   'first second').groupdict(),
    328                          {'first':'first', 'second':'second'})
    329 
    330     def test_expand(self):
    331         self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
    332                                   "first second")
    333                                   .expand(r"\2 \1 \g<second> \g<first>"),
    334                          "second first second first")
    335 
    336     def test_repeat_minmax(self):
    337         self.assertIsNone(re.match("^(\w){1}$", "abc"))
    338         self.assertIsNone(re.match("^(\w){1}?$", "abc"))
    339         self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
    340         self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))
    341 
    342         self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
    343         self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
    344         self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
    345         self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
    346         self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
    347         self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
    348         self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
    349         self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
    350 
    351         self.assertIsNone(re.match("^x{1}$", "xxx"))
    352         self.assertIsNone(re.match("^x{1}?$", "xxx"))
    353         self.assertIsNone(re.match("^x{1,2}$", "xxx"))
    354         self.assertIsNone(re.match("^x{1,2}?$", "xxx"))
    355 
    356         self.assertTrue(re.match("^x{3}$", "xxx"))
    357         self.assertTrue(re.match("^x{1,3}$", "xxx"))
    358         self.assertTrue(re.match("^x{1,4}$", "xxx"))
    359         self.assertTrue(re.match("^x{3,4}?$", "xxx"))
    360         self.assertTrue(re.match("^x{3}?$", "xxx"))
    361         self.assertTrue(re.match("^x{1,3}?$", "xxx"))
    362         self.assertTrue(re.match("^x{1,4}?$", "xxx"))
    363         self.assertTrue(re.match("^x{3,4}?$", "xxx"))
    364 
    365         self.assertIsNone(re.match("^x{}$", "xxx"))
    366         self.assertTrue(re.match("^x{}$", "x{}"))
    367 
    368     def test_getattr(self):
    369         self.assertEqual(re.match("(a)", "a").pos, 0)
    370         self.assertEqual(re.match("(a)", "a").endpos, 1)
    371         self.assertEqual(re.match("(a)", "a").string, "a")
    372         self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
    373         self.assertTrue(re.match("(a)", "a").re)
    374 
    def test_special_escapes(self):
        # \b, \B, \A, \Z and the character-class escapes, with the default,
        # LOCALE and (when available) UNICODE flags, on byte-string and
        # unicode subjects.
        flag_choices = [0, re.LOCALE]
        if have_unicode:
            flag_choices.append(re.UNICODE)

        def check_anchors(wrapped, bare):
            # `wrapped` is 'abc' surrounded by newlines, `bare` is 'abc'.
            self.assertEqual(re.search(r"^abc$", wrapped, re.M).group(0),
                             "abc")
            self.assertEqual(re.search(r"^\Aabc\Z$", bare, re.M).group(0),
                             "abc")
            self.assertIsNone(re.search(r"^\Aabc\Z$", wrapped, re.M))

        boundary_cases = (
            (r"\b(b.)\b", "abcd abc bcd bx"),
            (r"\B(b.)\B", "abc bcd bc abxd"),
        )
        for flags in flag_choices:
            for pattern, subject in boundary_cases:
                self.assertEqual(re.search(pattern, subject, flags).group(1),
                                 "bx")
        check_anchors("\nabc\n", "abc")

        # The same checks on unicode subjects.
        unicode_boundary_cases = (
            (r"\b(b.)\b", u"abcd abc bcd bx"),
            (r"\B(b.)\B", u"abc bcd bc abxd"),
        )
        for pattern, subject in unicode_boundary_cases:
            self.assertEqual(re.search(pattern, subject).group(1), "bx")
        check_anchors(u"\nabc\n", u"abc")

        for flags in flag_choices:
            found = re.search(r"\d\D\w\W\s\S", "1aa! a", flags)
            self.assertEqual(found.group(0), "1aa! a")
    406 
    407     def test_string_boundaries(self):
    408         # See http://bugs.python.org/issue10713
    409         self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
    410                          "abc")
    411         # There's a word boundary at the start of a string.
    412         self.assertTrue(re.match(r"\b", "abc"))
    413         # A non-empty string includes a non-boundary zero-length match.
    414         self.assertTrue(re.search(r"\B", "abc"))
    415         # There is no non-boundary match at the start of a string.
    416         self.assertFalse(re.match(r"\B", "abc"))
    417         # However, an empty string contains no word boundaries, and also no
    418         # non-boundaries.
    419         self.assertIsNone(re.search(r"\B", ""))
    420         # This one is questionable and different from the perlre behaviour,
    421         # but describes current behavior.
    422         self.assertIsNone(re.search(r"\b", ""))
    423         # A single word-character string has two boundaries, but no
    424         # non-boundary gaps.
    425         self.assertEqual(len(re.findall(r"\b", "a")), 2)
    426         self.assertEqual(len(re.findall(r"\B", "a")), 0)
    427         # If there are no words, there are no boundaries
    428         self.assertEqual(len(re.findall(r"\b", " ")), 0)
    429         self.assertEqual(len(re.findall(r"\b", "   ")), 0)
    430         # Can match around the whitespace.
    431         self.assertEqual(len(re.findall(r"\B", " ")), 2)
    432 
    @requires_unicode
    def test_bigcharset(self):
        # Character classes containing code points above the 8-bit range.
        for flags in (0, re.UNICODE):
            found = re.match(u(r"([\u2222\u2223])"), unichr(0x2222), flags)
            self.assertEqual(found.group(1), unichr(0x2222))
        # A class built from every 255th code point from 256 up to 2**16.
        members = u''.join([unichr(code) for code in range(256, 2**16, 255)])
        wide_class = u'[%s]' % members
        found = re.match(wide_class, unichr(0xff01), re.UNICODE)
        self.assertEqual(found.group(), unichr(0xff01))
    441 
    442     def test_big_codesize(self):
    443         # Issue #1160
    444         r = re.compile('|'.join(('%d'%x for x in range(10000))))
    445         self.assertTrue(r.match('1000'))
    446         self.assertTrue(r.match('9999'))
    447 
    448     def test_anyall(self):
    449         self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
    450                          "a\nb")
    451         self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
    452                          "a\n\nb")
    453 
    454     def test_lookahead(self):
    455         self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
    456         self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
    457         self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
    458         self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
    459         self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
    460         self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
    461         self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
    462 
    463         self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
    464         self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
    465         self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
    466         self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
    467 
    468         # Group reference.
    469         self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
    470         self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
    471         # Named group reference.
    472         self.assertTrue(re.match(r'(?P<g>a)b(?=(?P=g))a', 'aba'))
    473         self.assertIsNone(re.match(r'(?P<g>a)b(?=(?P=g))c', 'abac'))
    474         # Conditional group reference.
    475         self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
    476         self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
    477         self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
    478         self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
    479         self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
    480         # Group used before defined.
    481         self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
    482         self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
    483         self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
    484 
    def test_lookbehind(self):
        # Positive (?<=...) and negative (?<!...) lookbehind assertions.
        # (pattern, whether 'abc' is expected to match)
        simple_cases = (
            (r'ab(?<=b)c', True),
            (r'ab(?<=c)c', False),
            (r'ab(?<!b)c', False),
            (r'ab(?<!c)c', True),
        )
        for pattern, should_match in simple_cases:
            found = re.match(pattern, 'abc')
            if should_match:
                self.assertTrue(found)
            else:
                self.assertIsNone(found)

        # Compiling a lookbehind that refers to a group is expected to
        # emit a RuntimeWarning.
        group_reference_patterns = (
            # Group reference.
            r'(a)a(?<=\1)c',
            # Named group reference.
            r'(?P<g>a)a(?<=(?P=g))c',
            # Conditional group reference.
            r'(a)b(?<=(?(1)b|x))c',
            # Group used before defined.
            r'(a)b(?<=(?(2)b|x))(c)',
        )
        for pattern in group_reference_patterns:
            with check_warnings(('', RuntimeWarning)):
                re.compile(pattern)
    502 
    503     def test_ignore_case(self):
    504         self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
    505         self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
    506         self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
    507         self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
    508         self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
    509         self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
    510         self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
    511         self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
    512         self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
    513         self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
    514 
    515         if have_unicode:
    516             assert u(r'\u212a').lower() == u'k' # ''
    517             self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
    518             self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
    519             self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
    520             self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
    521             assert u(r'\u017f').upper() == u'S' # ''
    522             self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
    523             self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
    524             self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
    525             self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I))
    526 
    527     def test_ignore_case_set(self):
    528         self.assertTrue(re.match(r'[19A]', 'A', re.I))
    529         self.assertTrue(re.match(r'[19a]', 'a', re.I))
    530         self.assertTrue(re.match(r'[19a]', 'A', re.I))
    531         self.assertTrue(re.match(r'[19A]', 'a', re.I))
    532         if have_unicode:
    533             self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I))
    534             self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I))
    535             self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I))
    536             self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I))
    537             assert u(r'\u212a').lower() == u'k' # ''
    538             self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
    539             self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
    540             self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
    541             self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
    542             assert u(r'\u017f').upper() == u'S' # ''
    543             self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I))
    544             self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I))
    545             self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
    546             self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I))
    547 
    548     def test_ignore_case_range(self):
    549         # Issues #3511, #17381.
    550         self.assertTrue(re.match(r'[9-a]', '_', re.I))
    551         self.assertIsNone(re.match(r'[9-A]', '_', re.I))
    552         self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
    553         self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
    554         self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7',re.I))
    555         self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
    556         if have_unicode:
    557             self.assertTrue(re.match(u(r'[9-a]'), u(r'_'), re.U | re.I))
    558             self.assertIsNone(re.match(u(r'[9-A]'), u(r'_'), re.U | re.I))
    559             self.assertTrue(re.match(u(r'[\xc0-\xde]'),
    560                                      u(r'\xd7'), re.U | re.I))
    561             self.assertIsNone(re.match(u(r'[\xc0-\xde]'),
    562                                        u(r'\xf7'), re.U | re.I))
    563             self.assertTrue(re.match(u(r'[\xe0-\xfe]'),
    564                                      u(r'\xf7'), re.U | re.I))
    565             self.assertIsNone(re.match(u(r'[\xe0-\xfe]'),
    566                                        u(r'\xd7'), re.U | re.I))
    567             self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
    568                                      u(r'\u0450'), re.U | re.I))
    569             self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
    570                                      u(r'\u0400'), re.U | re.I))
    571             self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
    572                                      u(r'\u0450'), re.U | re.I))
    573             self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
    574                                      u(r'\u0400'), re.U | re.I))
    575             if sys.maxunicode > 0xffff:
    576                 self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
    577                                          u(r'\U00010428'), re.U | re.I))
    578                 self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
    579                                          u(r'\U00010400'), re.U | re.I))
    580                 self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
    581                                          u(r'\U00010428'), re.U | re.I))
    582                 self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
    583                                          u(r'\U00010400'), re.U | re.I))
    584 
    585             assert u(r'\u212a').lower() == u'k' # ''
    586             self.assertTrue(re.match(ur'[J-M]', u(r'\u212a'), re.U | re.I))
    587             self.assertTrue(re.match(ur'[j-m]', u(r'\u212a'), re.U | re.I))
    588             self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'K', re.U | re.I))
    589             self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'k', re.U | re.I))
    590             assert u(r'\u017f').upper() == u'S' # ''
    591             self.assertTrue(re.match(ur'[R-T]', u(r'\u017f'), re.U | re.I))
    592             self.assertTrue(re.match(ur'[r-t]', u(r'\u017f'), re.U | re.I))
    593             self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u'S', re.U | re.I))
    594             self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u's', re.U | re.I))
    595 
    596     def test_category(self):
    597         self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
    598 
    def test_getlower(self):
        # _sre.getlower() maps ord('A') to ord('a') for flags 0, LOCALE and
        # (when unicode is available) UNICODE.
        import _sre
        upper, lower = ord('A'), ord('a')
        self.assertEqual(_sre.getlower(upper, 0), lower)
        self.assertEqual(_sre.getlower(upper, re.LOCALE), lower)
        if have_unicode:
            self.assertEqual(_sre.getlower(upper, re.UNICODE), lower)

        # A case-insensitive match returns the subject text, not the pattern.
        for subject in ("ABC", u"ABC"):
            found = re.match("abc", subject, re.I)
            self.assertEqual(found.group(0), "ABC")
    608 
    609     def test_not_literal(self):
    610         self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
    611         self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
    612 
    613     def test_search_coverage(self):
    614         self.assertEqual(re.search("\s(b)", " b").group(1), "b")
    615         self.assertEqual(re.search("a\s", "a ").group(0), "a ")
    616 
    def assertMatch(self, pattern, text, match=None, span=None,
                    matcher=re.match):
        # Assert that matcher(pattern, text) succeeds and that the resulting
        # match object has the expected matched text and span.
        #
        # match and span must be given together; when both are omitted the
        # pattern is expected to match the whole of text.  Supplying only
        # one of them raises ValueError.
        if (match is None) != (span is None):
            raise ValueError('If match is not None, span should be specified '
                             '(and vice versa).')
        if match is None:
            match, span = text, (0, len(text))
        result = matcher(pattern, text)
        self.assertTrue(result)
        self.assertEqual(result.group(), match)
        self.assertEqual(result.span(), span)
    630 
    @requires_unicode
    def test_re_escape(self):
        # re.escape() on each of the first 256 code points: ASCII letters and
        # digits are returned unchanged, NUL becomes backslash + '000', and
        # every other character gets a backslash prefix.  Each escaped form
        # must also match the character it came from.
        alphanumerics = unicode(string.ascii_letters + string.digits)
        all_chars = u''.join(map(unichr, range(256)))
        for ch in all_chars:
            if ch in alphanumerics:
                expected = ch
            elif ch == u'\x00':
                expected = u'\\000'
            else:
                expected = u'\\' + ch
            self.assertEqual(re.escape(ch), expected)
            self.assertMatch(re.escape(ch), ch)
        self.assertMatch(re.escape(all_chars), all_chars)
    644 
    def test_re_escape_byte(self):
        # Byte-string counterpart of the escape check: ASCII letters and
        # digits are unchanged, NUL becomes backslash + '000', and every other
        # byte gets a backslash prefix.
        alphanumerics = string.ascii_letters + string.digits
        all_bytes = ''.join(map(chr, range(256)))
        for byte in all_bytes:
            if byte in alphanumerics:
                expected = byte
            elif byte == b'\x00':
                expected = b'\\000'
            else:
                expected = b'\\' + byte
            self.assertEqual(re.escape(byte), expected)
            self.assertMatch(re.escape(byte), byte)
        self.assertMatch(re.escape(all_bytes), all_bytes)
    657 
    @requires_unicode
    def test_re_escape_non_ascii(self):
        # Escaping a unicode string containing U+2620 puts a backslash before
        # each U+2620, and the escaped text still matches the original.
        text = u(r'xxx\u2620\u2620\u2620xxx')
        escaped = re.escape(text)
        self.assertEqual(escaped, u(r'xxx\\\u2620\\\u2620\\\u2620xxx'))
        self.assertMatch(escaped, text)
        run_pattern = u'.%s+.' % re.escape(unichr(0x2620))
        self.assertMatch(run_pattern, text,
                         match=u(r'x\u2620\u2620\u2620x'), span=(2, 7),
                         matcher=re.search)
    666 
    def test_re_escape_non_ascii_bytes(self):
        # Escaping a byte string with non-ASCII bytes backslash-prefixes each
        # of those bytes; the escaped three-byte sequence is found twice.
        raw = b'y\xe2\x98\xa0y\xe2\x98\xa0y'
        escaped = re.escape(raw)
        self.assertEqual(escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
        self.assertMatch(escaped, raw)
        hits = re.findall(re.escape(b'\xe2\x98\xa0'), raw)
        self.assertEqual(len(hits), 2)
    674 
    def test_pickling(self):
        # Run the pickle round-trip check with both pickle implementations,
        # then verify that the _compile() reconstructor can be imported from
        # both the sre and re modules.
        import pickle
        self.pickle_test(pickle)
        import cPickle
        self.pickle_test(cPickle)
        # old pickles expect the _compile() reconstructor in sre module
        import_module("sre", deprecated=True)
        from sre import _compile
        # current pickle expects the _compile() reconstructor in re module
        from re import _compile
    685 
    def pickle_test(self, pickle):
        # Round-trip one compiled pattern through every protocol supported by
        # the given pickle module and check the result equals the original.
        original = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
        for protocol in range(pickle.HIGHEST_PROTOCOL + 1):
            restored = pickle.loads(pickle.dumps(original, protocol))
            self.assertEqual(restored, original)
    692 
    693     def test_constants(self):
    694         self.assertEqual(re.I, re.IGNORECASE)
    695         self.assertEqual(re.L, re.LOCALE)
    696         self.assertEqual(re.M, re.MULTILINE)
    697         self.assertEqual(re.S, re.DOTALL)
    698         self.assertEqual(re.X, re.VERBOSE)
    699 
    700     def test_flags(self):
    701         for flag in [re.I, re.M, re.X, re.S, re.L]:
    702             self.assertTrue(re.compile('^pattern$', flag))
    703 
    704     def test_sre_character_literals(self):
    705         for i in [0, 8, 16, 32, 64, 127, 128, 255]:
    706             self.assertTrue(re.match(r"\%03o" % i, chr(i)))
    707             self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
    708             self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
    709             self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
    710             self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
    711             self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
    712         self.assertRaises(re.error, re.match, "\911", "")
    713 
    714     def test_sre_character_class_literals(self):
    715         for i in [0, 8, 16, 32, 64, 127, 128, 255]:
    716             self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
    717             self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
    718             self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
    719             self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
    720             self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
    721             self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
    722         self.assertRaises(re.error, re.match, "[\911]", "")
    723 
    724     def test_bug_113254(self):
    725         self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
    726         self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
    727         self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
    728 
    729     def test_bug_527371(self):
    730         # bug described in patches 527371/672491
    731         self.assertIsNone(re.match(r'(a)?a','a').lastindex)
    732         self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
    733         self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
    734         self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
    735         self.assertEqual(re.match("((a))", "a").lastindex, 1)
    736 
    737     def test_bug_545855(self):
    738         # bug 545855 -- This pattern failed to cause a compile error as it
    739         # should, instead provoking a TypeError.
    740         self.assertRaises(re.error, re.compile, 'foo[a-')
    741 
    742     def test_bug_418626(self):
    743         # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
    744         # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
    745         # pattern '*?' on a long string.
    746         self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
    747         self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
    748                          20003)
    749         self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
    750         # non-simple '*?' still used to hit the recursion limit, before the
    751         # non-recursive scheme was implemented.
    752         self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
    753 
    @requires_unicode
    def test_bug_612074(self):
        # A character class built from an escaped U+2039 must compile.
        pat = u"[%s]" % re.escape(unichr(0x2039))
        compiled = re.compile(pat)
        self.assertEqual(compiled and 1, 1)
    758 
    759     def test_stack_overflow(self):
    760         # nasty cases that used to overflow the straightforward recursive
    761         # implementation of repeated groups.
    762         self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
    763         self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
    764         self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
    765 
    766     def test_unlimited_zero_width_repeat(self):
    767         # Issue #9669
    768         self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
    769         self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
    770         self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
    771         self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
    772         self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
    773         self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
    774 
    775     def test_scanner(self):
    776         def s_ident(scanner, token): return token
    777         def s_operator(scanner, token): return "op%s" % token
    778         def s_float(scanner, token): return float(token)
    779         def s_int(scanner, token): return int(token)
    780 
    781         scanner = Scanner([
    782             (r"[a-zA-Z_]\w*", s_ident),
    783             (r"\d+\.\d*", s_float),
    784             (r"\d+", s_int),
    785             (r"=|\+|-|\*|/", s_operator),
    786             (r"\s+", None),
    787             ])
    788 
    789         self.assertTrue(scanner.scanner.scanner("").pattern)
    790 
    791         self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
    792                          (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
    793                            'op+', 'bar'], ''))
    794 
    795     def test_bug_448951(self):
    796         # bug 448951 (similar to 429357, but with single char match)
    797         # (Also test greedy matches.)
    798         for op in '','?','*':
    799             self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
    800                              (None, None))
    801             self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
    802                              ('a:', 'a'))
    803 
    804     def test_bug_725106(self):
    805         # capturing groups in alternatives in repeats
    806         self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
    807                          ('b', 'a'))
    808         self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
    809                          ('c', 'b'))
    810         self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
    811                          ('b', None))
    812         self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
    813                          ('b', None))
    814         self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
    815                          ('b', 'a'))
    816         self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
    817                          ('c', 'b'))
    818         self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
    819                          ('b', None))
    820         self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
    821                          ('b', None))
    822 
    823     def test_bug_725149(self):
    824         # mark_stack_base restoring before restoring marks
    825         self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
    826                          ('a', None))
    827         self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
    828                          ('a', None, None))
    829 
    @requires_unicode
    def test_bug_764548(self):
        # bug 764548, re.compile() barfs on str/unicode subclasses
        class my_unicode(unicode):
            pass

        compiled = re.compile(my_unicode("abc"))
        self.assertIsNone(compiled.match("xyz"))
    836 
    837     def test_finditer(self):
    838         iter = re.finditer(r":+", "a:b::c:::d")
    839         self.assertEqual([item.group(0) for item in iter],
    840                          [":", "::", ":::"])
    841 
    @requires_unicode
    def test_bug_926075(self):
        # The same pattern text compiled from a byte string and from a
        # unicode string must yield two distinct pattern objects.
        from_bytes = re.compile('bug_926075')
        from_unicode = re.compile(u'bug_926075')
        self.assertIsNot(from_bytes, from_unicode)
    846 
    @requires_unicode
    def test_bug_931848(self):
        # A unicode character class that includes U+002E ('.') splits an
        # ASCII string on its dots.
        dots = re.compile(u(r"[\u002E\u3002\uFF0E\uFF61]"))
        self.assertEqual(dots.split("a.b.c"), ['a', 'b', 'c'])
    852 
    853     def test_bug_581080(self):
    854         iter = re.finditer(r"\s", "a b")
    855         self.assertEqual(iter.next().span(), (1,2))
    856         self.assertRaises(StopIteration, iter.next)
    857 
    858         scanner = re.compile(r"\s").scanner("a b")
    859         self.assertEqual(scanner.search().span(), (1, 2))
    860         self.assertIsNone(scanner.search())
    861 
    862     def test_bug_817234(self):
    863         iter = re.finditer(r".*", "asdf")
    864         self.assertEqual(iter.next().span(), (0, 4))
    865         self.assertEqual(iter.next().span(), (4, 4))
    866         self.assertRaises(StopIteration, iter.next)
    867 
    @requires_unicode
    def test_bug_6561(self):
        # '\d' should match characters in Unicode category 'Nd'
        # (Number, Decimal Digit), but not those in 'Nl' (Number,
        # Letter) or 'No' (Number, Other).
        one_digit = r'^\d$'

        decimal_digits = [
            unichr(0x0037), # '\N{DIGIT SEVEN}', category 'Nd'
            unichr(0x0e58), # '\N{THAI DIGIT SIX}', category 'Nd'
            unichr(0xff10), # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
            ]
        for char in decimal_digits:
            found = re.match(one_digit, char, re.UNICODE)
            self.assertEqual(found.group(0), char)

        not_decimal_digits = [
            unichr(0x2165), # '\N{ROMAN NUMERAL SIX}', category 'Nl'
            unichr(0x3039), # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
            unichr(0x2082), # '\N{SUBSCRIPT TWO}', category 'No'
            unichr(0x32b4), # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
            ]
        for char in not_decimal_digits:
            self.assertIsNone(re.match(one_digit, char, re.UNICODE))
    889 
    def test_empty_array(self):
        # SF buf 1647541
        # Matching against an empty array of every typecode: a non-empty
        # pattern fails, the empty pattern matches with no groups.
        import array
        typecodes = 'cbBhHiIlLfd'
        if have_unicode:
            typecodes += 'u'
        for typecode in typecodes:
            empty = array.array(typecode)
            self.assertIsNone(re.compile("bla").match(empty))
            blank_match = re.compile("").match(empty)
            self.assertEqual(blank_match.groups(), ())
    900 
    @requires_unicode
    def test_inline_flags(self):
        # Bug #1700
        upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Below
        lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Below
        # Each pair is (character used in the pattern, character to match).
        pairs = [(upper_char, lower_char), (lower_char, upper_char)]

        # Both flags passed as arguments.
        for in_pattern, subject in pairs:
            compiled = re.compile(in_pattern, re.I | re.U)
            self.assertTrue(compiled.match(subject))

        # Inline (?i) combined with the re.U argument.
        for in_pattern, subject in pairs:
            compiled = re.compile('(?i)' + in_pattern, re.U)
            self.assertTrue(compiled.match(subject))

        # Both flags inline.
        for in_pattern, subject in pairs:
            compiled = re.compile('(?iu)' + in_pattern)
            self.assertTrue(compiled.match(subject))

        # Inline flags including verbose mode, via the module-level match().
        for in_pattern, subject in pairs:
            self.assertTrue(re.match('(?ixu) ' + in_pattern, subject))
    933 
    934     def test_dollar_matches_twice(self):
    935         "$ matches the end of string, and just before the terminating \n"
    936         pattern = re.compile('$')
    937         self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
    938         self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
    939         self.assertEqual(pattern.sub('#', '\n'), '#\n#')
    940 
    941         pattern = re.compile('$', re.MULTILINE)
    942         self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
    943         self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
    944         self.assertEqual(pattern.sub('#', '\n'), '#\n#')
    945 
    946     def test_dealloc(self):
    947         # issue 3299: check for segfault in debug build
    948         import _sre
    949         # the overflow limit is different on wide and narrow builds and it
    950         # depends on the definition of SRE_CODE (see sre.h).
    951         # 2**128 should be big enough to overflow on both. For smaller values
    952         # a RuntimeError is raised instead of OverflowError.
    953         long_overflow = 2**128
    954         self.assertRaises(TypeError, re.finditer, "a", {})
    955         self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
    956 
    957     def test_compile(self):
    958         # Test return value when given string and pattern as parameter
    959         pattern = re.compile('random pattern')
    960         self.assertIsInstance(pattern, re._pattern_type)
    961         same_pattern = re.compile(pattern)
    962         self.assertIsInstance(same_pattern, re._pattern_type)
    963         self.assertIs(same_pattern, pattern)
    964         # Test behaviour when not given a string or pattern as parameter
    965         self.assertRaises(TypeError, re.compile, 0)
    966 
    967     def test_bug_13899(self):
    968         # Issue #13899: re pattern r"[\A]" should work like "A" but matches
    969         # nothing. Ditto B and Z.
    970         self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
    971                          ['A', 'B', '\b', 'C', 'Z'])
    972 
    @precisionbigmemtest(size=_2G, memuse=1)
    def test_large_search(self, size):
        # Issue #10182: indices were 32-bit-truncated.
        haystack = 'a' * size
        at_end = re.search('$', haystack)
        self.assertIsNotNone(at_end)
        # The empty match must sit exactly at the end of the string.
        self.assertEqual(at_end.start(), size)
        self.assertEqual(at_end.end(), size)
    981 
    982     # The huge memuse is because of re.sub() using a list and a join()
    983     # to create the replacement result.
    @precisionbigmemtest(size=_2G, memuse=16 + 2)
    def test_large_subn(self, size):
        # Issue #10182: indices were 32-bit-truncated.
        original = 'a' * size
        replaced, count = re.subn('', '', original)
        self.assertEqual(replaced, original)
        # One empty match before each character plus one at the end.
        self.assertEqual(count, size + 1)
    991 
    992 
    993     def test_repeat_minmax_overflow(self):
    994         # Issue #13169
    995         string = "x" * 100000
    996         self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
    997         self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
    998         self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
    999         self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
   1000         self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
   1001         self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
   1002         # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
   1003         self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
   1004         self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
   1005         self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
   1006         self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
   1007 
   1008     @cpython_only
   1009     def test_repeat_minmax_overflow_maxrepeat(self):
   1010         try:
   1011             from _sre import MAXREPEAT
   1012         except ImportError:
   1013             self.skipTest('requires _sre.MAXREPEAT constant')
   1014         string = "x" * 100000
   1015         self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
   1016         self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
   1017                          (0, 100000))
   1018         self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
   1019         self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
   1020         self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
   1021         self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
   1022 
   1023     def test_backref_group_name_in_exception(self):
   1024         # Issue 17341: Poor error message when compiling invalid regex
   1025         with self.assertRaisesRegexp(sre_constants.error, '<foo>'):
   1026             re.compile('(?P=<foo>)')
   1027 
   1028     def test_group_name_in_exception(self):
   1029         # Issue 17341: Poor error message when compiling invalid regex
   1030         with self.assertRaisesRegexp(sre_constants.error, '\?foo'):
   1031             re.compile('(?P<?foo>)')
   1032 
   1033     def test_issue17998(self):
   1034         for reps in '*', '+', '?', '{1}':
   1035             for mod in '', '?':
   1036                 pattern = '.' + reps + mod + 'yz'
   1037                 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
   1038                                  ['xyz'], msg=pattern)
   1039                 if have_unicode:
   1040                     pattern = unicode(pattern)
   1041                     self.assertEqual(re.compile(pattern, re.S).findall(u'xyz'),
   1042                                      [u'xyz'], msg=pattern)
   1043 
   1044 
   1045     def test_bug_2537(self):
   1046         # issue 2537: empty submatches
   1047         for outer_op in ('{0,}', '*', '+', '{1,187}'):
   1048             for inner_op in ('{0,}', '*', '?'):
   1049                 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
   1050                 m = r.match("xyyzy")
   1051                 self.assertEqual(m.group(0), "xyy")
   1052                 self.assertEqual(m.group(1), "")
   1053                 self.assertEqual(m.group(2), "y")
   1054 
    def test_debug_flag(self):
        # Compiling with re.DEBUG must print the parsed pattern tree to
        # stdout; the expected text below is compared verbatim.
        pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        dump = '''\
subpattern 1
  literal 46
subpattern None
  branch
    in
      literal 99
      literal 104
  or
    literal 112
    literal 121
subpattern None
  groupref_exists 1
    at at_end
  else
    literal 58
    literal 32
'''
        self.assertEqual(out.getvalue(), dump)
        # Debug output is output again even a second time (bypassing
        # the cache -- issue #20426).
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        self.assertEqual(out.getvalue(), dump)
   1083 
   1084     def test_keyword_parameters(self):
   1085         # Issue #20283: Accepting the string keyword parameter.
   1086         pat = re.compile(r'(ab)')
   1087         self.assertEqual(
   1088             pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
   1089         self.assertEqual(
   1090             pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
   1091         self.assertEqual(
   1092             pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
   1093         self.assertEqual(
   1094             pat.split(string='abracadabra', maxsplit=1),
   1095             ['', 'ab', 'racadabra'])
   1096 
   1097     def test_match_group_takes_long(self):
   1098         self.assertEqual(re.match("(foo)", "foo").group(1L), "foo")
   1099         self.assertRaises(IndexError, re.match("", "").group, sys.maxint + 1)
   1100 
   1101     def test_locale_caching(self):
   1102         # Issue #22410
   1103         oldlocale = locale.setlocale(locale.LC_CTYPE)
   1104         self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
   1105         for loc in 'en_US.iso88591', 'en_US.utf8':
   1106             try:
   1107                 locale.setlocale(locale.LC_CTYPE, loc)
   1108             except locale.Error:
   1109                 # Unsupported locale on this system
   1110                 self.skipTest('test needs %s locale' % loc)
   1111 
   1112         re.purge()
   1113         self.check_en_US_iso88591()
   1114         self.check_en_US_utf8()
   1115         re.purge()
   1116         self.check_en_US_utf8()
   1117         self.check_en_US_iso88591()
   1118 
   1119     def check_en_US_iso88591(self):
   1120         locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
   1121         self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
   1122         self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))
   1123         self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I))
   1124         self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
   1125         self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5'))
   1126         self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5'))
   1127 
   1128     def check_en_US_utf8(self):
   1129         locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
   1130         self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
   1131         self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I))
   1132         self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I))
   1133         self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
   1134         self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
   1135         self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
   1136 
   1137 
   1138 def run_re_tests():
   1139     from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
   1140     if verbose:
   1141         print 'Running re_tests test suite'
   1142     else:
   1143         # To save time, only run the first and last 10 tests
   1144         #tests = tests[:10] + tests[-10:]
   1145         pass
   1146 
   1147     for t in tests:
   1148         sys.stdout.flush()
   1149         pattern = s = outcome = repl = expected = None
   1150         if len(t) == 5:
   1151             pattern, s, outcome, repl, expected = t
   1152         elif len(t) == 3:
   1153             pattern, s, outcome = t
   1154         else:
   1155             raise ValueError, ('Test tuples should have 3 or 5 fields', t)
   1156 
   1157         try:
   1158             obj = re.compile(pattern)
   1159         except re.error:
   1160             if outcome == SYNTAX_ERROR: pass  # Expected a syntax error
   1161             else:
   1162                 print '=== Syntax error:', t
   1163         except KeyboardInterrupt: raise KeyboardInterrupt
   1164         except:
   1165             print '*** Unexpected error ***', t
   1166             if verbose:
   1167                 traceback.print_exc(file=sys.stdout)
   1168         else:
   1169             try:
   1170                 result = obj.search(s)
   1171             except re.error, msg:
   1172                 print '=== Unexpected exception', t, repr(msg)
   1173             if outcome == SYNTAX_ERROR:
   1174                 # This should have been a syntax error; forget it.
   1175                 pass
   1176             elif outcome == FAIL:
   1177                 if result is None: pass   # No match, as expected
   1178                 else: print '=== Succeeded incorrectly', t
   1179             elif outcome == SUCCEED:
   1180                 if result is not None:
   1181                     # Matched, as expected, so now we compute the
   1182                     # result string and compare it to our expected result.
   1183                     start, end = result.span(0)
   1184                     vardict={'found': result.group(0),
   1185                              'groups': result.group(),
   1186                              'flags': result.re.flags}
   1187                     for i in range(1, 100):
   1188                         try:
   1189                             gi = result.group(i)
   1190                             # Special hack because else the string concat fails:
   1191                             if gi is None:
   1192                                 gi = "None"
   1193                         except IndexError:
   1194                             gi = "Error"
   1195                         vardict['g%d' % i] = gi
   1196                     for i in result.re.groupindex.keys():
   1197                         try:
   1198                             gi = result.group(i)
   1199                             if gi is None:
   1200                                 gi = "None"
   1201                         except IndexError:
   1202                             gi = "Error"
   1203                         vardict[i] = gi
   1204                     repl = eval(repl, vardict)
   1205                     if repl != expected:
   1206                         print '=== grouping error', t,
   1207                         print repr(repl) + ' should be ' + repr(expected)
   1208                 else:
   1209                     print '=== Failed incorrectly', t
   1210 
   1211                 # Try the match on a unicode string, and check that it
   1212                 # still succeeds.
   1213                 try:
   1214                     result = obj.search(unicode(s, "latin-1"))
   1215                     if result is None:
   1216                         print '=== Fails on unicode match', t
   1217                 except NameError:
   1218                     continue # 1.5.2
   1219                 except TypeError:
   1220                     continue # unicode test case
   1221 
   1222                 # Try the match on a unicode pattern, and check that it
   1223                 # still succeeds.
   1224                 obj=re.compile(unicode(pattern, "latin-1"))
   1225                 result = obj.search(s)
   1226                 if result is None:
   1227                     print '=== Fails on unicode pattern match', t
   1228 
   1229                 # Try the match with the search area limited to the extent
   1230                 # of the match and see if it still succeeds.  \B will
   1231                 # break (because it won't match at the end or start of a
   1232                 # string), so we'll ignore patterns that feature it.
   1233 
   1234                 if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \
   1235                                and result is not None:
   1236                     obj = re.compile(pattern)
   1237                     result = obj.search(s, result.start(0), result.end(0) + 1)
   1238                     if result is None:
   1239                         print '=== Failed on range-limited match', t
   1240 
   1241                 # Try the match with IGNORECASE enabled, and check that it
   1242                 # still succeeds.
   1243                 obj = re.compile(pattern, re.IGNORECASE)
   1244                 result = obj.search(s)
   1245                 if result is None:
   1246                     print '=== Fails on case-insensitive match', t
   1247 
   1248                 # Try the match with LOCALE enabled, and check that it
   1249                 # still succeeds.
   1250                 obj = re.compile(pattern, re.LOCALE)
   1251                 result = obj.search(s)
   1252                 if result is None:
   1253                     print '=== Fails on locale-sensitive match', t
   1254 
   1255                 # Try the match with UNICODE locale enabled, and check
   1256                 # that it still succeeds.
   1257                 obj = re.compile(pattern, re.UNICODE)
   1258                 result = obj.search(s)
   1259                 if result is None:
   1260                     print '=== Fails on unicode-sensitive match', t
   1261 
   1262 def test_main():  # Test entry point: run the unittest-based cases first, then run_re_tests().
   1263     run_unittest(ReTests)  # ReTests is the unittest.TestCase subclass defined at the top of this file
   1264     run_re_tests()  # called after the unittest run; takes no arguments and its return value is ignored
   1265 
   1266 if __name__ == "__main__":  # only when executed as a script, not when imported
   1267     test_main()  # run the full set of checks driven by test_main()
   1268