Home | History | Annotate | Download | only in test
      1 """ Test script for the Unicode implementation.
      2 
      3 Written by Marc-Andre Lemburg (mal (at] lemburg.com).
      4 
      5 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
      6 
      7 """#"
      8 import sys
      9 import struct
     10 import codecs
     11 import unittest
     12 from test import test_support, string_tests
     13 
     14 # decorator to skip tests on narrow builds
     15 requires_wide_build = unittest.skipIf(sys.maxunicode == 65535,
     16                                       'requires wide build')
     17 
     18 # Error handling (bad decoder return)
     19 def search_function(encoding):
     20     def decode1(input, errors="strict"):
     21         return 42 # not a tuple
     22     def encode1(input, errors="strict"):
     23         return 42 # not a tuple
     24     def encode2(input, errors="strict"):
     25         return (42, 42) # no unicode
     26     def decode2(input, errors="strict"):
     27         return (42, 42) # no unicode
     28     if encoding=="test.unicode1":
     29         return (encode1, decode1, None, None)
     30     elif encoding=="test.unicode2":
     31         return (encode2, decode2, None, None)
     32     else:
     33         return None
     34 codecs.register(search_function)
     35 
     36 class UnicodeTest(
     37     string_tests.CommonTest,
     38     string_tests.MixinStrUnicodeUserStringTest,
     39     string_tests.MixinStrUnicodeTest,
     40     ):
     41     type2test = unicode
     42 
     43     def assertEqual(self, first, second, msg=None):
     44         # strict assertEqual method: reject implicit bytes/unicode equality
     45         super(UnicodeTest, self).assertEqual(first, second, msg)
     46         if isinstance(first, unicode) or isinstance(second, unicode):
     47             self.assertIsInstance(first, unicode)
     48             self.assertIsInstance(second, unicode)
     49         elif isinstance(first, str) or isinstance(second, str):
     50             self.assertIsInstance(first, str)
     51             self.assertIsInstance(second, str)
     52 
     53     def checkequalnofix(self, result, object, methodname, *args):
     54         method = getattr(object, methodname)
     55         realresult = method(*args)
     56         self.assertEqual(realresult, result)
     57         self.assertTrue(type(realresult) is type(result))
     58 
     59         # if the original is returned make sure that
     60         # this doesn't happen with subclasses
     61         if realresult is object:
     62             class usub(unicode):
     63                 def __repr__(self):
     64                     return 'usub(%r)' % unicode.__repr__(self)
     65             object = usub(object)
     66             method = getattr(object, methodname)
     67             realresult = method(*args)
     68             self.assertEqual(realresult, result)
     69             self.assertTrue(object is not realresult)
     70 
     71     def test_literals(self):
     72         self.assertEqual(u'\xff', u'\u00ff')
     73         self.assertEqual(u'\uffff', u'\U0000ffff')
     74         self.assertRaises(SyntaxError, eval, 'u\'\\Ufffffffe\'')
     75         self.assertRaises(SyntaxError, eval, 'u\'\\Uffffffff\'')
     76         self.assertRaises(SyntaxError, eval, 'u\'\\U%08x\'' % 0x110000)
     77 
     78     def test_repr(self):
     79         if not sys.platform.startswith('java'):
     80             # Test basic sanity of repr()
     81             self.assertEqual(repr(u'abc'), "u'abc'")
     82             self.assertEqual(repr(u'ab\\c'), "u'ab\\\\c'")
     83             self.assertEqual(repr(u'ab\\'), "u'ab\\\\'")
     84             self.assertEqual(repr(u'\\c'), "u'\\\\c'")
     85             self.assertEqual(repr(u'\\'), "u'\\\\'")
     86             self.assertEqual(repr(u'\n'), "u'\\n'")
     87             self.assertEqual(repr(u'\r'), "u'\\r'")
     88             self.assertEqual(repr(u'\t'), "u'\\t'")
     89             self.assertEqual(repr(u'\b'), "u'\\x08'")
     90             self.assertEqual(repr(u"'\""), """u'\\'"'""")
     91             self.assertEqual(repr(u"'\""), """u'\\'"'""")
     92             self.assertEqual(repr(u"'"), '''u"'"''')
     93             self.assertEqual(repr(u'"'), """u'"'""")
     94             latin1repr = (
     95                 "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
     96                 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
     97                 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
     98                 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
     99                 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
    100                 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
    101                 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
    102                 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
    103                 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
    104                 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
    105                 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
    106                 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
    107                 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
    108                 "\\xfe\\xff'")
    109             testrepr = repr(u''.join(map(unichr, xrange(256))))
    110             self.assertEqual(testrepr, latin1repr)
    111             # Test repr works on wide unicode escapes without overflow.
    112             self.assertEqual(repr(u"\U00010000" * 39 + u"\uffff" * 4096),
    113                              repr(u"\U00010000" * 39 + u"\uffff" * 4096))
    114 
    115 
    def test_count(self):
        string_tests.CommonTest.test_count(self)
        # check mixed argument types: (expected count, receiver, count() args)
        # The original repeated the (0, u'aaa', 'b') check; it is kept once.
        mixed_cases = (
            (3,  'aaa', (u'a',)),
            (0,  'aaa', (u'b',)),
            (3, u'aaa', ('a',)),
            (0, u'aaa', ('b',)),
            (1, u'aaa', ('a', -1)),
            (3, u'aaa', ('a', -10)),
            (2, u'aaa', ('a', 0, -1)),
            (0, u'aaa', ('a', 0, -10)),
        )
        for expected, receiver, args in mixed_cases:
            self.checkequalnofix(expected, receiver, 'count', *args)
    128 
    def test_find(self):
        # (expected index, find() arguments) against one fixed haystack
        haystack = u'abcdefghiabc'
        for expected, args in (
                (0, (u'abc',)),
                (9, (u'abc', 1)),
                (-1, (u'def', 4)),
        ):
            self.checkequalnofix(expected, haystack, 'find', *args)

        # calling without an argument or with an integer raises TypeError
        find = u'hello'.find
        self.assertRaises(TypeError, find)
        self.assertRaises(TypeError, find, 42)
    136 
    def test_rfind(self):
        string_tests.CommonTest.test_rfind(self)
        # check mixed argument types: (expected index, receiver, needle)
        for expected, receiver, needle in (
                (9,   'abcdefghiabc', u'abc'),
                (12,  'abcdefghiabc', u''),
                (12, u'abcdefghiabc',  ''),
        ):
            self.checkequalnofix(expected, receiver, 'rfind', needle)
    143 
    def test_index(self):
        string_tests.CommonTest.test_index(self)
        # (expected index, haystack, needle, extra index() arguments)
        found = (
            (0, 'abcdefghiabc', '', ()),
            (3, 'abcdefghiabc', 'def', ()),
            (0, 'abcdefghiabc', 'abc', ()),
            (9, 'abcdefghiabc', 'abc', (1,)),
        )
        # (haystack, needle, extra index() arguments) that raise ValueError
        missing = (
            ('abcdefghiabc', 'hib', ()),
            ('abcdefghiab', 'abc', (1,)),
            ('abcdefghi', 'ghi', (8,)),
            ('abcdefghi', 'ghi', (-1,)),
        )
        # check mixed argument types
        for receiver_type, needle_type in ((str, unicode), (unicode, str)):
            for expected, haystack, needle, extra in found:
                self.checkequalnofix(expected, receiver_type(haystack),
                                     'index', needle_type(needle), *extra)
            for haystack, needle, extra in missing:
                self.assertRaises(ValueError, receiver_type(haystack).index,
                                  needle_type(needle), *extra)
    156 
    def test_rindex(self):
        string_tests.CommonTest.test_rindex(self)
        # (expected index, haystack, needle, extra rindex() arguments)
        found = (
            (12, 'abcdefghiabc', '', ()),
            (3,  'abcdefghiabc', 'def', ()),
            (9,  'abcdefghiabc', 'abc', ()),
            (0,  'abcdefghiabc', 'abc', (0, -1)),
        )
        # (haystack, needle, extra rindex() arguments) that raise ValueError
        missing = (
            ('abcdefghiabc', 'hib', ()),
            ('defghiabc', 'def', (1,)),
            ('defghiabc', 'abc', (0, -1)),
            ('abcdefghi', 'ghi', (0, 8)),
            ('abcdefghi', 'ghi', (0, -1)),
        )
        # check mixed argument types
        for receiver_type, needle_type in ((str, unicode), (unicode, str)):
            for expected, haystack, needle, extra in found:
                self.checkequalnofix(expected, receiver_type(haystack),
                                     'rindex', needle_type(needle), *extra)
            for haystack, needle, extra in missing:
                self.assertRaises(ValueError, receiver_type(haystack).rindex,
                                  needle_type(needle), *extra)
    171 
    def test_translate(self):
        a, b, c = ord('a'), ord('b'), ord('c')
        # (expected, source, table): a table value may be None, an ordinal
        # or a unicode string
        cases = (
            (u'bbbc', u'abababc', {a: None}),
            (u'iiic', u'abababc', {a: None, b: ord('i')}),
            (u'iiix', u'abababc', {a: None, b: ord('i'), c: u'x'}),
            (u'<i><i><i>c', u'abababc', {a: None, b: u'<i>'}),
            (u'c', u'abababc', {a: None, b: u''}),
            (u'xyyx', u'xzx', {ord('z'): u'yy'}),
        )
        for expected, source, table in cases:
            self.checkequalnofix(expected, source, 'translate', table)

        # a missing table and a byte-string table value both raise TypeError
        self.assertRaises(TypeError, u'hello'.translate)
        self.assertRaises(TypeError, u'abababc'.translate, {a: ''})
    182 
    def test_split(self):
        string_tests.CommonTest.test_split(self)

        # Mixed arguments: the pieces are unicode whichever side is unicode
        pieces = [u'a', u'b', u'c', u'd']
        for receiver, separator in ((u'a//b//c//d', '//'),
                                    ('a//b//c//d', u'//')):
            self.checkequalnofix(pieces, receiver, 'split', separator)
        self.checkequalnofix([u'endcase ', u''], u'endcase test', 'split', 'test')
    190 
    def test_join(self):
        string_tests.MixinStrUnicodeUserStringTest.test_join(self)

        # mixed arguments: (expected, separator, items to join)
        cases = (
            (u'a b c d', u' ', ['a', 'b', u'c', u'd']),
            (u'abcd', u'', (u'a', u'b', u'c', u'd')),
            (u'w x y z', u' ', string_tests.Sequence('wxyz')),
            (u'a b c d', ' ', [u'a', u'b', u'c', u'd']),
            (u'a b c d', ' ', ['a', 'b', u'c', u'd']),
            (u'abcd', '', (u'a', u'b', u'c', u'd')),
            (u'w x y z', ' ', string_tests.Sequence(u'wxyz')),
        )
        for expected, separator, items in cases:
            self.checkequalnofix(expected, separator, 'join', items)
    202 
    def test_strip(self):
        string_tests.CommonTest.test_strip(self)
        # stripping with a byte string holding 0xff must raise UnicodeError
        strip = u"hello".strip
        self.assertRaises(UnicodeError, strip, "\xff")
    206 
    def test_replace(self):
        string_tests.CommonTest.test_replace(self)

        # method call forwarded from str implementation because of unicode argument
        byte_source = 'one!two!three!'
        self.checkequalnofix(u'one@two!three!', byte_source, 'replace', u'!', u'@', 1)
        # an integer replacement value raises TypeError
        replace = 'replace'.replace
        self.assertRaises(TypeError, replace, u"r", 42)
    213 
    def test_comparison(self):
        # Comparisons: equality across every str/unicode combination
        for left, right in ((u'abc', 'abc'), ('abc', u'abc'), (u'abc', u'abc')):
            self.assertTrue(left == right)
        # ... and ordering, checked in both directions for each combination
        for shorter, longer in ((u'abc', 'abcd'), ('abc', u'abcd'),
                                (u'abc', u'abcd')):
            self.assertTrue(longer > shorter)
            self.assertTrue(shorter < longer)

        if 0:
            # Deliberately disabled.
            # Move these tests to a Unicode collation module test...
            # Testing UTF-16 code point order comparisons...

            # No surrogates, no fixup required.
            self.assertTrue(u'\u0061' < u'\u20ac')
            # Non surrogate below surrogate value, no fixup required
            self.assertTrue(u'\u0061' < u'\ud800\udc02')

            # Non surrogate above surrogate value, fixup required
            def test_lecmp(s, s2):
                self.assertTrue(s < s2)

            def test_fixup(s):
                for s2 in (
                        u'\ud800\udc01', u'\ud900\udc01',
                        u'\uda00\udc01', u'\udb00\udc01',
                        u'\ud800\udd01', u'\ud900\udd01',
                        u'\uda00\udd01', u'\udb00\udd01',
                        u'\ud800\ude01', u'\ud900\ude01',
                        u'\uda00\ude01', u'\udb00\ude01',
                        u'\ud800\udfff', u'\ud900\udfff',
                        u'\uda00\udfff', u'\udb00\udfff',
                ):
                    test_lecmp(s, s2)

            # These driver calls used to be indented into test_fixup itself,
            # where they were never reached from outside and would have
            # recursed without end; they belong at this level.
            test_fixup(u'\ue000')
            test_fixup(u'\uff61')

        # Surrogates on both sides, no fixup required
        self.assertTrue(u'\ud800\udc02' < u'\ud84d\udc56')
    278 
    def test_capitalize(self):
        string_tests.CommonTest.test_capitalize(self)
        # (expected, source) pairs for capitalize()
        cases = (
            # check that titlecased chars are lowered correctly
            # \u1ffc is the titlecased char
            (u'\u1ffc\u1ff3\u1ff3\u1ff3', u'\u1ff3\u1ff3\u1ffc\u1ffc'),
            # check with cased non-letter chars
            (u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
             u'\u24c5\u24ce\u24c9\u24bd\u24c4\u24c3'),
            (u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
             u'\u24df\u24e8\u24e3\u24d7\u24de\u24dd'),
            (u'\u2160\u2171\u2172', u'\u2160\u2161\u2162'),
            (u'\u2160\u2171\u2172', u'\u2170\u2171\u2172'),
            # check with Ll chars with no upper - nothing changes here
            (u'\u019b\u1d00\u1d86\u0221\u1fb7',
             u'\u019b\u1d00\u1d86\u0221\u1fb7'),
        )
        for expected, source in cases:
            self.checkequal(expected, source, 'capitalize')
    297 
    def test_islower(self):
        string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
        # U+1FFC must not be reported as lowercase
        sample = u'\u1FFc'
        self.checkequalnofix(False, sample, 'islower')
    301 
    @requires_wide_build
    def test_islower_non_bmp(self):
        # non-BMP samples grouped by case
        uppercase = (u'\U00010401', u'\U00010427')
        lowercase = (u'\U00010429', u'\U0001044E')
        non_cased = (u'\U0001F40D', u'\U0001F46F')
        for ch in uppercase:
            self.assertFalse(ch.islower())
        for ch in lowercase:
            self.assertTrue(ch.islower())
        for ch in non_cased:
            self.assertFalse(ch.islower())
    313 
    def test_isupper(self):
        string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
        # the extra check is skipped on Java platforms
        if sys.platform.startswith('java'):
            return
        self.checkequalnofix(False, u'\u1FFc', 'isupper')
    318 
    @requires_wide_build
    def test_isupper_non_bmp(self):
        # non-BMP samples grouped by case
        uppercase = (u'\U00010401', u'\U00010427')
        lowercase = (u'\U00010429', u'\U0001044E')
        non_cased = (u'\U0001F40D', u'\U0001F46F')
        for ch in uppercase:
            self.assertTrue(ch.isupper())
        for ch in lowercase:
            self.assertFalse(ch.isupper())
        for ch in non_cased:
            self.assertFalse(ch.isupper())
    330 
    def test_istitle(self):
        string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
        # U+1FFC alone, and as the first character of a word, is titlecase
        for text in (u'\u1FFc', u'Greek \u1FFcitlecases ...'):
            self.checkequalnofix(True, text, 'istitle')
    335 
    @requires_wide_build
    def test_istitle_non_bmp(self):
        # non-BMP, uppercase + lowercase
        for pair in (u'\U00010401\U00010429', u'\U00010427\U0001044E'):
            self.assertTrue(pair.istitle())
        # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
        not_title = (u'\U00010429', u'\U0001044E', u'\U0001F40D', u'\U0001F46F')
        for ch in not_title:
            self.assertFalse(ch.istitle(), '{!r} is not title'.format(ch))
    344 
    def test_isspace(self):
        string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
        # (expected isspace() result, character)
        for expected, ch in ((True, u'\u2000'),
                             (True, u'\u200a'),
                             (False, u'\u2014')):
            self.checkequalnofix(expected, ch, 'isspace')
    350 
    @requires_wide_build
    def test_isspace_non_bmp(self):
        # apparently there are no non-BMP spaces chars in Unicode 6
        samples = (
            u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
            u'\U0001F40D', u'\U0001F46F',
        )
        for sample in samples:
            self.assertFalse(sample.isspace(),
                             '{!r} is not space.'.format(sample))
    357 
    @requires_wide_build
    def test_isalnum_non_bmp(self):
        # Every sample is expected to be alphanumeric.  The original list
        # named U+104A0 twice; the redundant entry is dropped.
        samples = (
            u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
            u'\U0001D7F6', u'\U000104A0', u'\U0001F107',
        )
        for ch in samples:
            self.assertTrue(ch.isalnum(), '{!r} is alnum.'.format(ch))
    363 
    def test_isalpha(self):
        string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
        # U+1FFC must be reported as alphabetic
        sample = u'\u1FFc'
        self.checkequalnofix(True, sample, 'isalpha')
    367 
    @requires_wide_build
    def test_isalpha_non_bmp(self):
        # non-BMP, cased
        cased = (u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E')
        for ch in cased:
            self.assertTrue(ch.isalpha())
        # non-BMP, non-cased
        non_cased = (u'\U0001F40D', u'\U0001F46F')
        for ch in non_cased:
            self.assertFalse(ch.isalpha())
    378 
    def test_isdecimal(self):
        # (expected isdecimal() result, text)
        cases = (
            (False, u''),
            (False, u'a'),
            (True, u'0'),
            (False, u'\u2460'),  # CIRCLED DIGIT ONE
            (False, u'\xbc'),    # VULGAR FRACTION ONE QUARTER
            (True, u'\u0660'),   # ARABIC-INDIC DIGIT ZERO
            (True, u'0123456789'),
            (False, u'0123456789a'),
        )
        for expected, text in cases:
            self.checkequalnofix(expected, text, 'isdecimal')

        # isdecimal() takes no argument
        self.checkraises(TypeError, 'abc', 'isdecimal', 42)
    390 
    @requires_wide_build
    def test_isdecimal_non_bmp(self):
        # samples that must not be reported as decimal
        not_decimal = (
            u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
            u'\U0001F40D', u'\U0001F46F', u'\U00011065', u'\U0001F107',
        )
        for ch in not_decimal:
            self.assertFalse(ch.isdecimal(), '{!r} is not decimal.'.format(ch))
        # samples that must be reported as decimal; the original list named
        # U+104A0 twice, the redundant entry is dropped
        decimal = (u'\U0001D7F6', u'\U000104A0')
        for ch in decimal:
            self.assertTrue(ch.isdecimal(), '{!r} is decimal.'.format(ch))
    398 
    def test_isdigit(self):
        string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
        # (expected isdigit() result, character)
        for expected, ch in ((True, u'\u2460'),
                             (False, u'\xbc'),
                             (True, u'\u0660')):
            self.checkequalnofix(expected, ch, 'isdigit')
    404 
    @requires_wide_build
    def test_isdigit_non_bmp(self):
        # samples that must not be reported as digits
        not_digits = (
            u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
            u'\U0001F40D', u'\U0001F46F', u'\U00011065',
        )
        for ch in not_digits:
            self.assertFalse(ch.isdigit(), '{!r} is not a digit.'.format(ch))
        # samples that must be reported as digits; the original list named
        # U+104A0 twice, the redundant entry is dropped
        digits = (u'\U0001D7F6', u'\U000104A0', u'\U0001F107')
        for ch in digits:
            self.assertTrue(ch.isdigit(), '{!r} is a digit.'.format(ch))
    412 
    def test_isnumeric(self):
        # (expected isnumeric() result, text)
        cases = (
            (False, u''),
            (False, u'a'),
            (True, u'0'),
            (True, u'\u2460'),
            (True, u'\xbc'),
            (True, u'\u0660'),
            (True, u'0123456789'),
            (False, u'0123456789a'),
        )
        for expected, text in cases:
            self.checkequalnofix(expected, text, 'isnumeric')

        # isnumeric() takes no argument
        isnumeric = u"abc".isnumeric
        self.assertRaises(TypeError, isnumeric, 42)
    424 
    @requires_wide_build
    def test_isnumeric_non_bmp(self):
        # samples that must not be reported as numeric
        not_numeric = (
            u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
            u'\U0001F40D', u'\U0001F46F',
        )
        for sample in not_numeric:
            self.assertFalse(sample.isnumeric(),
                             '{!r} is not numeric.'.format(sample))
        # samples that must be reported as numeric
        numeric = (
            u'\U00010107', u'\U0001D7F6', u'\U00023b1b',
            u'\U000104A0', u'\U0001F107',
        )
        for sample in numeric:
            self.assertTrue(sample.isnumeric(),
                            '{!r} is numeric.'.format(sample))
    433 
    @requires_wide_build
    def test_surrogates(self):
        # this test actually passes on narrow too, but it's just by accident.
        # Surrogates are seen as non-cased chars, so u'X\uD800X' is as
        # uppercase as 'X X'
        lone_surrogates = (u'\uD800', u'\uDFFF', u'\uD800\uD800', u'\uDFFF\uDFFF')
        lower_mixed = (u'a\uD800b\uDFFF', u'a\uDFFFb\uD800',
                       u'a\uD800b\uDFFFa', u'a\uDFFFb\uD800a')
        upper_mixed = (u'A\uD800B\uDFFF', u'A\uDFFFB\uD800',
                       u'A\uD800B\uDFFFA', u'A\uDFFFB\uD800A')

        for text in lower_mixed:
            self.assertTrue(text.islower())
            self.assertFalse(text.isupper())
            self.assertFalse(text.istitle())
        for text in upper_mixed:
            self.assertFalse(text.islower())
            self.assertTrue(text.isupper())
            self.assertTrue(text.istitle())

        # case predicates are all false for strings made only of surrogates
        for meth_name in ('islower', 'isupper', 'istitle'):
            predicate = getattr(unicode, meth_name)
            for text in lone_surrogates:
                self.assertFalse(predicate(text),
                                 '%r.%s() is False' % (text, meth_name))

        # character-class predicates are false for any string holding a surrogate
        for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
                          'isdecimal', 'isnumeric'):
            predicate = getattr(unicode, meth_name)
            for text in lone_surrogates + lower_mixed:
                self.assertFalse(predicate(text),
                                 '%r.%s() is False' % (text, meth_name))
    462 
    463 
    @requires_wide_build
    def test_lower(self):
        string_tests.CommonTest.test_lower(self)
        # (source, expected source.lower()) with non-BMP cased characters
        cases = (
            (u'\U00010427', u'\U0001044F'),
            (u'\U00010427\U00010427', u'\U0001044F\U0001044F'),
            (u'\U00010427\U0001044F', u'\U0001044F\U0001044F'),
            (u'X\U00010427x\U0001044F', u'x\U0001044Fx\U0001044F'),
        )
        for source, expected in cases:
            self.assertEqual(source.lower(), expected)
    474 
    @requires_wide_build
    def test_upper(self):
        string_tests.CommonTest.test_upper(self)
        # (source, expected source.upper()) with non-BMP cased characters
        cases = (
            (u'\U0001044F', u'\U00010427'),
            (u'\U0001044F\U0001044F', u'\U00010427\U00010427'),
            (u'\U00010427\U0001044F', u'\U00010427\U00010427'),
            (u'X\U00010427x\U0001044F', u'X\U00010427X\U00010427'),
        )
        for source, expected in cases:
            self.assertEqual(source.upper(), expected)
    485 
    @requires_wide_build
    def test_capitalize_non_bmp(self):
        # This method used to be named test_capitalize, which rebound the
        # earlier test_capitalize defined in this class so that the earlier
        # one never ran.  It now follows the *_non_bmp naming of the sibling
        # wide-build tests; the shared CommonTest checks stay with
        # test_capitalize.
        # (source, expected source.capitalize()) with non-BMP cased characters
        cases = (
            (u'\U0001044F', u'\U00010427'),
            (u'\U0001044F\U0001044F', u'\U00010427\U0001044F'),
            (u'\U00010427\U0001044F', u'\U00010427\U0001044F'),
            (u'\U0001044F\U00010427', u'\U00010427\U0001044F'),
            (u'X\U00010427x\U0001044F', u'X\U0001044Fx\U0001044F'),
        )
        for source, expected in cases:
            self.assertEqual(source.capitalize(), expected)
    498 
    @requires_wide_build
    def test_title(self):
        string_tests.MixinStrUnicodeUserStringTest.test_title(self)
        # (source, expected source.title()) with non-BMP cased characters
        cases = (
            (u'\U0001044F', u'\U00010427'),
            (u'\U0001044F\U0001044F', u'\U00010427\U0001044F'),
            (u'\U0001044F\U0001044F \U0001044F\U0001044F',
             u'\U00010427\U0001044F \U00010427\U0001044F'),
            (u'\U00010427\U0001044F \U00010427\U0001044F',
             u'\U00010427\U0001044F \U00010427\U0001044F'),
            (u'\U0001044F\U00010427 \U0001044F\U00010427',
             u'\U00010427\U0001044F \U00010427\U0001044F'),
            (u'X\U00010427x\U0001044F X\U00010427x\U0001044F',
             u'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F'),
        )
        for source, expected in cases:
            self.assertEqual(source.title(), expected)
    513 
    @requires_wide_build
    def test_swapcase(self):
        string_tests.CommonTest.test_swapcase(self)
        # (source, expected source.swapcase()) with non-BMP cased characters
        cases = (
            (u'\U0001044F', u'\U00010427'),
            (u'\U00010427', u'\U0001044F'),
            (u'\U0001044F\U0001044F', u'\U00010427\U00010427'),
            (u'\U00010427\U0001044F', u'\U0001044F\U00010427'),
            (u'\U0001044F\U00010427', u'\U00010427\U0001044F'),
            (u'X\U00010427x\U0001044F', u'x\U0001044FX\U00010427'),
        )
        for source, expected in cases:
            self.assertEqual(source.swapcase(), expected)
    527 
    def test_contains(self):
        # Testing Unicode contains method
        # Each entry is (member, container).  Checks that the original
        # repeated verbatim are listed once.
        present = (
            ('a', u'abdb'),
            ('a', u'bdab'),
            ('a', u'bdaba'),
            ('a', u'bdba'),
            (u'a', u'bdba'),
            (u'a', 'bdba'),
            (u'a', ('a', 1, None)),
            (u'a', (1, None, 'a')),
            (u'a', (1, None, u'a')),
            ('a', ('a', 1, None)),
            ('a', (1, None, 'a')),
            ('a', (1, None, u'a')),
            (u'ab', u'abcd'),
            ('ab', u'abc'),
            (u'ab', 'abc'),
            (u'ab', (1, None, u'ab')),
            # the empty string is contained in every string
            (u'', ''),
            ('', u''),
            (u'', u''),
            (u'', 'abc'),
            ('', u'abc'),
            (u'', u'abc'),
            # NUL characters at either end
            (u'\0', '\0abc'),
            ('\0', u'\0abc'),
            (u'\0', u'\0abc'),
            (u'\0', 'abc\0'),
            ('\0', u'abc\0'),
            (u'\0', u'abc\0'),
            (u'a', '\0abc'),
            ('a', u'\0abc'),
            (u'a', u'\0abc'),
            # whole-string matches
            (u'asdf', 'asdf'),
            ('asdf', u'asdf'),
            (u'asdf', u'asdf'),
        )
        absent = (
            (u'a', u'bdb'),
            (u'a', 'bdb'),
            ('a', ('x', 1, u'y')),
            ('a', ('x', 1, None)),
            (u'abcd', u'abcxxxx'),
            (u'\0', 'abc'),
            ('\0', u'abc'),
            (u'\0', u'abc'),
            # member longer than the container
            (u'asdf', 'asd'),
            ('asdf', u'asd'),
            (u'asdf', u'asd'),
            (u'asdf', ''),
            ('asdf', u''),
            (u'asdf', u''),
        )
        for member, container in present:
            self.assertIn(member, container)
        for member, container in absent:
            self.assertNotIn(member, container)

        # If the following fails either
        # the contains operator does not propagate UnicodeErrors or
        # someone has changed the default encoding
        self.assertRaises(UnicodeDecodeError, 'g\xe2teau'.__contains__, u'\xe2')
        self.assertRaises(UnicodeDecodeError, u'g\xe2teau'.__contains__, '\xe2')

        # a missing argument or a non-string argument raises TypeError
        self.assertRaises(TypeError, u"abc".__contains__)
        self.assertRaises(TypeError, u"abc".__contains__, object())
    591 
    def test_formatting(self):
        # Checks %-formatting when the template and/or its arguments are
        # unicode, after running the shared str/unicode formatting checks.
        string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)

        # Unicode templates fed a mix of unicode, str and numeric arguments.
        self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
        numeric_cases = [
            ((u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000,  3.00'),
            ((u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000,  3.00'),
            ((u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000,  3.50'),
            ((u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000,  3.57'),
            ((u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57'),
        ]
        for arguments, expected in numeric_cases:
            self.assertEqual(u"%s, %s, %i, %f, %5.2f" % arguments, expected)
        running_on_java = sys.platform.startswith('java')
        if not running_on_java:
            self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")

        # Mapping-based templates, including a non-ASCII key.
        self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
        self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')

        # %c with an integer, an out-of-range integer, and a broken template.
        self.assertEqual(u'%c' % 0x1234, u'\u1234')
        self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))
        self.assertRaises(ValueError, u"%.1\u1032f".__mod__, (1.0/3))

        # ASCII range: %c accepts both the byte character and its ordinal.
        for code in range(0x00,0x80):
            ascii_char = chr(code)
            as_unicode = unicode(ascii_char)
            self.assertEqual(u"%c" % ascii_char, as_unicode)
            self.assertEqual(u"%c" % code, as_unicode)
            self.assertTrue(ascii_char == u"%c" % ascii_char)
            self.assertTrue(ascii_char == u"%c" % code)
        # Issue 7649
        for code in range(0x80,0x100):
            uchar = unichr(code)
            self.assertEqual(uchar, u"%c" % code)   # works only with ints
            self.assertEqual(uchar, u"%c" % uchar)  # and unicode chars
            # the implicit decoding should fail for non-ascii chars
            high_byte = chr(code)
            self.assertRaises(UnicodeDecodeError, u"%c".__mod__, high_byte)
            self.assertRaises(UnicodeDecodeError, u"%s".__mod__, high_byte)

        # formatting jobs delegated from the string implementation:
        # each pair is (result of formatting a str template, expected value).
        delegated_cases = [
            ('...%(foo)s...' % {'foo':u"abc"}, u'...abc...'),
            ('...%(foo)s...' % {'foo':"abc"}, '...abc...'),
            ('...%(foo)s...' % {u'foo':"abc"}, '...abc...'),
            ('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...'),
            ('...%(foo)s...' % {u'foo':u"abc",'def':123},  u'...abc...'),
            ('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...'),
            ('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...'),
            ('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...'),
            ('...%s...' % u"abc", u'...abc...'),
            ('%*s' % (5,u'abc',), u'  abc'),
            ('%*s' % (-5,u'abc',), u'abc  '),
            ('%*.*s' % (5,2,u'abc',), u'   ab'),
            ('%*.*s' % (5,3,u'abc',), u'  abc'),
            ('%i %*.*s' % (10, 5,3,u'abc',), u'10   abc'),
            ('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103   abc'),
            ('%c' % u'a', u'a'),
        ]
        for formatted, expected in delegated_cases:
            self.assertEqual(formatted, expected)

        # An object whose __str__ returns unicode yields a unicode result.
        class Wrapper:
            def __str__(self):
                return u'\u1234'
        self.assertEqual('%s' % Wrapper(), u'\u1234')
    646 
    @test_support.cpython_only
    def test_formatting_huge_precision(self):
        # A precision one past INT_MAX must be rejected with ValueError.
        from _testcapi import INT_MAX
        oversized_precision = INT_MAX + 1
        template = u"%.{}f".format(oversized_precision)
        self.assertRaises(ValueError, template.__mod__, 2.34)
    653 
    def test_formatting_huge_width(self):
        # A field width one past sys.maxsize must be rejected with ValueError.
        oversized_width = sys.maxsize + 1
        template = u"%{}f".format(oversized_width)
        self.assertRaises(ValueError, template.__mod__, 2.34)
    658 
    def test_startswith_endswith_errors(self):
        # Both methods must propagate decode errors for non-ASCII byte
        # strings and name the accepted types when given a wrong type.
        for meth in (u'foo'.startswith, u'foo'.endswith):
            self.assertRaises(UnicodeDecodeError, meth, '\xff')
            with self.assertRaises(TypeError) as caught:
                meth(['f'])
            message = str(caught.exception)
            for type_name in ('unicode', 'str', 'tuple'):
                self.assertIn(type_name, message)
    669 
    @test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
    def test_format_float(self):
        # Float formatting must always use the C locale's '.' separator,
        # never the comma of the locale the decorator requests.
        formatted = u'%.1f' % 1.0
        self.assertEqual(u'1.0', formatted)
    674 
    675     def test_constructor(self):
    676         # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
    677 
    678         self.assertEqual(
    679             unicode(u'unicode remains unicode'),
    680             u'unicode remains unicode'
    681         )
    682 
    683         class UnicodeSubclass(unicode):
    684             pass
    685 
    686         self.assertEqual(
    687             unicode(UnicodeSubclass('unicode subclass becomes unicode')),
    688             u'unicode subclass becomes unicode'
    689         )
    690 
    691         self.assertEqual(
    692             unicode('strings are converted to unicode'),
    693             u'strings are converted to unicode'
    694         )
    695 
    696         class UnicodeCompat:
    697             def __init__(self, x):
    698                 self.x = x
    699             def __unicode__(self):
    700                 return self.x
    701 
    702         self.assertEqual(
    703             unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
    704             u'__unicode__ compatible objects are recognized')
    705 
    706         class StringCompat:
    707             def __init__(self, x):
    708                 self.x = x
    709             def __str__(self):
    710                 return self.x
    711 
    712         self.assertEqual(
    713             unicode(StringCompat('__str__ compatible objects are recognized')),
    714             u'__str__ compatible objects are recognized'
    715         )
    716 
    717         # unicode(obj) is compatible to str():
    718 
    719         o = StringCompat('unicode(obj) is compatible to str()')
    720         self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
    721         self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
    722 
    723         # %-formatting and .__unicode__()
    724         self.assertEqual(u'%s' %
    725                          UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
    726                          u"u'%s' % obj uses obj.__unicode__()")
    727         self.assertEqual(u'%s' %
    728                          UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"),
    729                          u"u'%s' % obj falls back to obj.__str__()")
    730 
    731         for obj in (123, 123.45, 123L):
    732             self.assertEqual(unicode(obj), unicode(str(obj)))
    733 
    734         # unicode(obj, encoding, error) tests (this maps to
    735         # PyUnicode_FromEncodedObject() at C level)
    736 
    737         if not sys.platform.startswith('java'):
    738             self.assertRaises(
    739                 TypeError,
    740                 unicode,
    741                 u'decoding unicode is not supported',
    742                 'utf-8',
    743                 'strict'
    744             )
    745 
    746         self.assertEqual(
    747             unicode('strings are decoded to unicode', 'utf-8', 'strict'),
    748             u'strings are decoded to unicode'
    749         )
    750 
    751         if not sys.platform.startswith('java'):
    752             with test_support.check_py3k_warnings():
    753                 buf = buffer('character buffers are decoded to unicode')
    754             self.assertEqual(
    755                 unicode(
    756                     buf,
    757                     'utf-8',
    758                     'strict'
    759                 ),
    760                 u'character buffers are decoded to unicode'
    761             )
    762 
    763         self.assertRaises(TypeError, unicode, 42, 42, 42)
    764 
    765     def test_codecs_utf7(self):
    766         utfTests = [
    767             (u'A\u2262\u0391.', 'A+ImIDkQ.'),             # RFC2152 example
    768             (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'),     # RFC2152 example
    769             (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'),        # RFC2152 example
    770             (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
    771             (u'+', '+-'),
    772             (u'+-', '+--'),
    773             (u'+?', '+-?'),
    774             (u'\?', '+AFw?'),
    775             (u'+?', '+-?'),
    776             (ur'\\?', '+AFwAXA?'),
    777             (ur'\\\?', '+AFwAXABc?'),
    778             (ur'++--', '+-+---'),
    779             (u'\U000abcde', '+2m/c3g-'),                  # surrogate pairs
    780             (u'/', '/'),
    781         ]
    782 
    783         for (x, y) in utfTests:
    784             self.assertEqual(x.encode('utf-7'), y)
    785 
    786         # Unpaired surrogates are passed through
    787         self.assertEqual(u'\uD801'.encode('utf-7'), '+2AE-')
    788         self.assertEqual(u'\uD801x'.encode('utf-7'), '+2AE-x')
    789         self.assertEqual(u'\uDC01'.encode('utf-7'), '+3AE-')
    790         self.assertEqual(u'\uDC01x'.encode('utf-7'), '+3AE-x')
    791         self.assertEqual('+2AE-'.decode('utf-7'), u'\uD801')
    792         self.assertEqual('+2AE-x'.decode('utf-7'), u'\uD801x')
    793         self.assertEqual('+3AE-'.decode('utf-7'), u'\uDC01')
    794         self.assertEqual('+3AE-x'.decode('utf-7'), u'\uDC01x')
    795 
    796         self.assertEqual(u'\uD801\U000abcde'.encode('utf-7'), '+2AHab9ze-')
    797         self.assertEqual('+2AHab9ze-'.decode('utf-7'), u'\uD801\U000abcde')
    798 
    799         # Direct encoded characters
    800         set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
    801         # Optional direct characters
    802         set_o = '!"#$%&*;<=>@[]^_`{|}'
    803         for c in set_d:
    804             self.assertEqual(c.encode('utf7'), c.encode('ascii'))
    805             self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
    806             self.assertTrue(c == c.encode('ascii').decode('utf7'))
    807         for c in set_o:
    808             self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
    809             self.assertTrue(c == c.encode('ascii').decode('utf7'))
    810 
    def test_codecs_utf8(self):
        # Encoder: (unicode text, expected UTF-8 bytes).
        encode_cases = [
            (u'', ''),
            (u'\u20ac', '\xe2\x82\xac'),
            (u'\ud800\udc02', '\xf0\x90\x80\x82'),
            (u'\ud84d\udc56', '\xf0\xa3\x91\x96'),
            (u'\ud800', '\xed\xa0\x80'),
            (u'\udc00', '\xed\xb0\x80'),
        ]
        for text, encoded in encode_cases:
            self.assertEqual(text.encode('utf-8'), encoded)

        # The same surrogate pair repeated many times.
        repeated_pair = u'\ud800\udc02'*1000
        self.assertEqual(repeated_pair.encode('utf-8'),
                         '\xf0\x90\x80\x82'*1000)

        # A longer passage mixing multi-byte characters with ASCII.
        passage = (
            u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
            u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
            u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
            u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
            u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
            u' Nunstuck git und'
        )
        passage_bytes = (
            '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
            '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
            '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
            '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
            '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
            '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
            '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
            '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
            '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
            '\xe3\x80\x8cWenn ist das Nunstuck git und'
        )
        self.assertEqual(passage.encode('utf-8'), passage_bytes)

        # UTF-8 specific decoding tests
        decode_cases = [
            ('\xf0\xa3\x91\x96', u'\U00023456'),
            ('\xf0\x90\x80\x82', u'\U00010002'),
            ('\xe2\x82\xac', u'\u20ac'),
        ]
        for encoded, text in decode_cases:
            self.assertEqual(unicode(encoded, 'utf-8'), text)

        # Other possible utf-8 test cases:
        # * strict decoding testing for all of the
        #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
    849 
    def test_utf8_decode_valid_sequences(self):
        # Boundary values of every UTF-8 sequence length must decode to the
        # expected code point, and every code point must round-trip.
        sequences = [
            # single byte
            ('\x00', u'\x00'), ('a', u'a'), ('\x7f', u'\x7f'),
            # 2 bytes
            ('\xc2\x80', u'\x80'), ('\xdf\xbf', u'\u07ff'),
            # 3 bytes
            ('\xe0\xa0\x80', u'\u0800'), ('\xed\x9f\xbf', u'\ud7ff'),
            ('\xee\x80\x80', u'\uE000'), ('\xef\xbf\xbf', u'\uffff'),
            # 4 bytes
            ('\xF0\x90\x80\x80', u'\U00010000'),
            ('\xf4\x8f\xbf\xbf', u'\U0010FFFF')
        ]
        for seq, res in sequences:
            self.assertEqual(seq.decode('utf-8'), res)

        # Round-trip every code point up to and including sys.maxunicode.
        # xrange avoids materialising a list of every code point, and the
        # "+ 1" makes the upper bound inclusive.
        for code in xrange(sys.maxunicode + 1):
            ch = unichr(code)
            self.assertEqual(ch, ch.encode('utf-8').decode('utf-8'))
    868 
    def test_utf8_decode_invalid_sequences(self):
        def assert_undecodable(data):
            # Strict UTF-8 decoding of `data` must raise UnicodeDecodeError.
            self.assertRaises(UnicodeDecodeError, data.decode, 'utf-8')

        # continuation bytes in a sequence of 2, 3, or 4 bytes
        continuation_bytes = [chr(value) for value in range(0x80, 0xC0)]
        # start bytes of a 2-byte sequence equivalent to codepoints <= 0x7F
        invalid_2B_seq_start_bytes = [chr(value) for value in range(0xC0, 0xC2)]
        # start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF
        invalid_4B_seq_start_bytes = [chr(value) for value in range(0xF5, 0xF8)]
        remaining_high_bytes = [chr(value) for value in range(0xF7, 0x100)]
        invalid_start_bytes = (
            continuation_bytes + invalid_2B_seq_start_bytes +
            invalid_4B_seq_start_bytes + remaining_high_bytes
        )

        # A lone invalid start byte never decodes.
        for byte in invalid_start_bytes:
            assert_undecodable(byte)

        # Overlong 2-byte forms stay invalid with any continuation byte.
        for sb in invalid_2B_seq_start_bytes:
            for cb in continuation_bytes:
                assert_undecodable(sb+cb)

        # Out-of-range 4-byte start bytes followed by continuation bytes.
        first_three = continuation_bytes[:3]
        for sb in invalid_4B_seq_start_bytes:
            for cb1 in first_three:
                for cb3 in first_three:
                    assert_undecodable(sb+cb1+'\x80'+cb3)

        # '\xE0' followed by a second byte in 0x80..0x9F.
        for value in range(0x80, 0xA0):
            cb = chr(value)
            assert_undecodable('\xE0'+cb+'\x80')
            assert_undecodable('\xE0'+cb+'\xBF')

        # XXX: surrogates shouldn't be valid UTF-8!
        # see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
        # (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
        # i.e. '\xED' + (0xA0..0xBF) + '\x80' / '\xBF' ought to raise
        # UnicodeDecodeError, but since they are valid on Python 2
        # add a test for that:
        for index in range(0x20):
            cb = chr(0xA0 + index)
            surrogate = unichr(0xd800 + 64 * index)
            encoded = '\xED'+cb+'\x80'
            self.assertEqual(encoded.decode('utf-8'), surrogate)
            self.assertEqual(surrogate.encode('utf-8'), encoded)

        # '\xF0' with a second byte in 0x80..0x8F, and '\xF4' with a second
        # byte in 0x90..0xBF, are invalid whatever the trailing bytes are.
        four_byte_cases = [
            ('\xF0', range(0x80, 0x90)),
            ('\xF4', range(0x90, 0xC0)),
        ]
        for lead, second_values in four_byte_cases:
            for value in second_values:
                cb = chr(value)
                assert_undecodable(lead+cb+'\x80\x80')
                assert_undecodable(lead+cb+'\xBF\xBF')
    924 
    def test_issue8271(self):
        # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
        # only the start byte and the continuation byte(s) are now considered
        # invalid, instead of the number of bytes specified by the start byte.
        # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
        # table 3-8, Row 2) for more information about the algorithm used.
        #
        # Each entry is (invalid byte string, expected 'replace' result).
        # In the row comments "cb" means continuation byte: '\x80' is a
        # valid one, '\x41' ('A') is not.
        FFFD = u'\ufffd'
        sequences = [
            # invalid start bytes
            ('\x80', FFFD), # continuation byte
            ('\x80\x80', FFFD*2), # 2 continuation bytes
            ('\xc0', FFFD),
            ('\xc0\xc0', FFFD*2),
            ('\xc1', FFFD),
            ('\xc1\xc0', FFFD*2),
            ('\xc0\xc1', FFFD*2),
            # with start byte of a 2-byte sequence
            ('\xc2', FFFD), # only the start byte
            ('\xc2\xc2', FFFD*2), # 2 start bytes
            ('\xc2\xc2\xc2', FFFD*3), # 3 start bytes
            ('\xc2\x41', FFFD+'A'), # invalid continuation byte
            # with start byte of a 3-byte sequence
            ('\xe1', FFFD), # only the start byte
            ('\xe1\xe1', FFFD*2), # 2 start bytes
            ('\xe1\xe1\xe1', FFFD*3), # 3 start bytes
            ('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
            ('\xe1\x80', FFFD), # only 1 continuation byte
            ('\xe1\x41', FFFD+'A'), # invalid continuation byte
            ('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
            ('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
            ('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
            ('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
            ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
            # with start byte of a 4-byte sequence
            ('\xf1', FFFD), # only the start byte
            ('\xf1\xf1', FFFD*2), # 2 start bytes
            ('\xf1\xf1\xf1', FFFD*3), # 3 start bytes
            ('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
            ('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
            ('\xf1\x80', FFFD), # only 1 continuation byte
            ('\xf1\x80\x80', FFFD), # only 2 continuation bytes
            ('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
            ('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 2 invalid
            ('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
            ('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cb and 1 valid
            ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 valid
            ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # invalid, valid, invalid cb
            ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 2 invalid cb and 1 valid
            ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
            ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
            ('\xf1\xf1\x80\x41', FFFD*2+'A'),
            ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
            # with invalid start byte of a 4-byte sequence (rfc2279)
            ('\xf5', FFFD), # only the start byte
            ('\xf5\xf5', FFFD*2), # 2 start bytes
            ('\xf5\x80', FFFD*2), # only 1 continuation byte
            ('\xf5\x80\x80', FFFD*3), # only 2 continuation bytes
            ('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
            ('\xf5\x80\x41', FFFD*2+'A'), #  1 valid cb and 1 invalid
            ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
            ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
            # with invalid start byte of a 5-byte sequence (rfc2279)
            ('\xf8', FFFD), # only the start byte
            ('\xf8\xf8', FFFD*2), # 2 start bytes
            ('\xf8\x80', FFFD*2), # only one continuation byte
            ('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
            ('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
            # with invalid start byte of a 6-byte sequence (rfc2279)
            ('\xfc', FFFD), # only the start byte
            ('\xfc\xfc', FFFD*2), # 2 start bytes
            ('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
            ('\xfc\x80\x80\x80\x80\x80', FFFD*6), # invalid 6 bytes seq with 6 bytes
            # invalid start byte
            ('\xfe', FFFD),
            ('\xfe\x80\x80', FFFD*3),
            # other sequences
            ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
            ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
            ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
            ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
             u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
        ]
        for seq, res in sequences:
            # Every sequence is invalid, so strict decoding must fail ...
            self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
            # ... 'replace' must substitute U+FFFD exactly as listed, also
            # when a valid character follows the broken sequence ...
            self.assertEqual(seq.decode('utf-8', 'replace'), res)
            self.assertEqual((seq+'b').decode('utf-8', 'replace'), res+'b')
            # ... and 'ignore' must drop exactly what 'replace' substituted.
            self.assertEqual(seq.decode('utf-8', 'ignore'),
                             res.replace(u'\uFFFD', ''))
   1013 
   1014     def test_codecs_idna(self):
   1015         # Test whether trailing dot is preserved
   1016         self.assertEqual(u"www.python.org.".encode("idna"), "www.python.org.")
   1017 
   1018     def test_codecs_errors(self):
   1019         # Error handling (encoding)
   1020         self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii')
   1021         self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii','strict')
   1022         self.assertEqual(u'Andr\202 x'.encode('ascii','ignore'), "Andr x")
   1023         self.assertEqual(u'Andr\202 x'.encode('ascii','replace'), "Andr? x")
   1024         self.assertEqual(u'Andr\202 x'.encode('ascii', 'replace'),
   1025                          u'Andr\202 x'.encode('ascii', errors='replace'))
   1026         self.assertEqual(u'Andr\202 x'.encode('ascii', 'ignore'),
   1027                          u'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
   1028 
   1029         # Error handling (decoding)
   1030         self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii')
   1031         self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
   1032         self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x")
   1033         self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x')
   1034         self.assertEqual(u'abcde'.decode('ascii', 'ignore'),
   1035                          u'abcde'.decode('ascii', errors='ignore'))
   1036         self.assertEqual(u'abcde'.decode('ascii', 'replace'),
   1037                          u'abcde'.decode(encoding='ascii', errors='replace'))
   1038 
   1039         # Error handling (unknown character names)
   1040         self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")
   1041 
   1042         # Error handling (truncated escape sequence)
   1043         self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")
   1044 
   1045         self.assertRaises(TypeError, "hello".decode, "test.unicode1")
   1046         self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
   1047         self.assertRaises(TypeError, u"hello".encode, "test.unicode1")
   1048         self.assertRaises(TypeError, u"hello".encode, "test.unicode2")
   1049         # executes PyUnicode_Encode()
   1050         import imp
   1051         self.assertRaises(
   1052             ImportError,
   1053             imp.find_module,
   1054             "non-existing module",
   1055             [u"non-existing dir"]
   1056         )
   1057 
   1058         # Error handling (wrong arguments)
   1059         self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)
   1060 
   1061         # Error handling (PyUnicode_EncodeDecimal())
   1062         self.assertRaises(UnicodeError, int, u"\u0200")
   1063 
   1064     def test_codecs(self):
   1065         # Encoding
   1066         self.assertEqual(u'hello'.encode('ascii'), 'hello')
   1067         self.assertEqual(u'hello'.encode('utf-7'), 'hello')
   1068         self.assertEqual(u'hello'.encode('utf-8'), 'hello')
   1069         self.assertEqual(u'hello'.encode('utf8'), 'hello')
   1070         self.assertEqual(u'hello'.encode('utf-16-le'), 'h\000e\000l\000l\000o\000')
   1071         self.assertEqual(u'hello'.encode('utf-16-be'), '\000h\000e\000l\000l\000o')
   1072         self.assertEqual(u'hello'.encode('latin-1'), 'hello')
   1073 
   1074         # Roundtrip safety for BMP (just the first 1024 chars)
   1075         for c in xrange(1024):
   1076             u = unichr(c)
   1077             for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
   1078                              'utf-16-be', 'raw_unicode_escape',
   1079                              'unicode_escape', 'unicode_internal'):
   1080                 self.assertEqual(unicode(u.encode(encoding),encoding), u)
   1081 
   1082         # Roundtrip safety for BMP (just the first 256 chars)
   1083         for c in xrange(256):
   1084             u = unichr(c)
   1085             for encoding in ('latin-1',):
   1086                 self.assertEqual(unicode(u.encode(encoding),encoding), u)
   1087 
   1088         # Roundtrip safety for BMP (just the first 128 chars)
   1089         for c in xrange(128):
   1090             u = unichr(c)
   1091             for encoding in ('ascii',):
   1092                 self.assertEqual(unicode(u.encode(encoding),encoding), u)
   1093 
   1094         # Roundtrip safety for non-BMP (just a few chars)
   1095         u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
   1096         for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
   1097                          #'raw_unicode_escape',
   1098                          'unicode_escape', 'unicode_internal'):
   1099             self.assertEqual(unicode(u.encode(encoding),encoding), u)
   1100 
   1101         # UTF-8 must be roundtrip safe for all UCS-2 code points
   1102         # This excludes surrogates: in the full range, there would be
   1103         # a surrogate pair (\udbff\udc00), which gets converted back
   1104         # to a non-BMP character (\U0010fc00)
   1105         u = u''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000)))
   1106         for encoding in ('utf-8',):
   1107             self.assertEqual(unicode(u.encode(encoding),encoding), u)
   1108 
   1109     def test_codecs_charmap(self):
   1110         # 0-127
   1111         s = ''.join(map(chr, xrange(128)))
   1112         for encoding in (
   1113             'cp037', 'cp1026',
   1114             'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
   1115             'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
   1116             'cp863', 'cp865', 'cp866',
   1117             'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
   1118             'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
   1119             'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
   1120             'mac_cyrillic', 'mac_latin2',
   1121 
   1122             'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
   1123             'cp1256', 'cp1257', 'cp1258',
   1124             'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
   1125 
   1126             'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
   1127             'cp1006', 'iso8859_8',
   1128 
   1129             ### These have undefined mappings:
   1130             #'cp424',
   1131 
   1132             ### These fail the round-trip:
   1133             #'cp875'
   1134 
   1135             ):
   1136             self.assertEqual(unicode(s, encoding).encode(encoding), s)
   1137 
   1138         # 128-255
   1139         s = ''.join(map(chr, xrange(128, 256)))
   1140         for encoding in (
   1141             'cp037', 'cp1026',
   1142             'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
   1143             'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
   1144             'cp863', 'cp865', 'cp866',
   1145             'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
   1146             'iso8859_2', 'iso8859_4', 'iso8859_5',
   1147             'iso8859_9', 'koi8_r', 'latin_1',
   1148             'mac_cyrillic', 'mac_latin2',
   1149 
   1150             ### These have undefined mappings:
   1151             #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
   1152             #'cp1256', 'cp1257', 'cp1258',
   1153             #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
   1154             #'iso8859_3', 'iso8859_6', 'iso8859_7',
   1155             #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
   1156 
   1157             ### These fail the round-trip:
   1158             #'cp1006', 'cp875', 'iso8859_8',
   1159 
   1160             ):
   1161             self.assertEqual(unicode(s, encoding).encode(encoding), s)
   1162 
   1163     def test_concatenation(self):
   1164         self.assertEqual((u"abc" u"def"), u"abcdef")
   1165         self.assertEqual(("abc" u"def"), u"abcdef")
   1166         self.assertEqual((u"abc" "def"), u"abcdef")
   1167         self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi")
   1168         self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi")
   1169 
   1170     def test_printing(self):
   1171         class BitBucket:
   1172             def write(self, text):
   1173                 pass
   1174 
   1175         out = BitBucket()
   1176         print >>out, u'abc'
   1177         print >>out, u'abc', u'def'
   1178         print >>out, u'abc', 'def'
   1179         print >>out, 'abc', u'def'
   1180         print >>out, u'abc\n'
   1181         print >>out, u'abc\n',
   1182         print >>out, u'abc\n',
   1183         print >>out, u'def\n'
   1184         print >>out, u'def\n'
   1185 
   1186     def test_ucs4(self):
   1187         x = u'\U00100000'
   1188         y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
   1189         self.assertEqual(x, y)
   1190 
   1191         y = r'\U00100000'
   1192         x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
   1193         self.assertEqual(x, y)
   1194         y = r'\U00010000'
   1195         x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
   1196         self.assertEqual(x, y)
   1197 
   1198         try:
   1199             '\U11111111'.decode("raw-unicode-escape")
   1200         except UnicodeDecodeError as e:
   1201             self.assertEqual(e.start, 0)
   1202             self.assertEqual(e.end, 10)
   1203         else:
   1204             self.fail("Should have raised UnicodeDecodeError")
   1205 
   1206     def test_conversion(self):
   1207         # Make sure __unicode__() works properly
   1208         class Foo0:
   1209             def __str__(self):
   1210                 return "foo"
   1211 
   1212         class Foo1:
   1213             def __unicode__(self):
   1214                 return u"foo"
   1215 
   1216         class Foo2(object):
   1217             def __unicode__(self):
   1218                 return u"foo"
   1219 
   1220         class Foo3(object):
   1221             def __unicode__(self):
   1222                 return "foo"
   1223 
   1224         class Foo4(str):
   1225             def __unicode__(self):
   1226                 return "foo"
   1227 
   1228         class Foo5(unicode):
   1229             def __unicode__(self):
   1230                 return "foo"
   1231 
   1232         class Foo6(str):
   1233             def __str__(self):
   1234                 return "foos"
   1235 
   1236             def __unicode__(self):
   1237                 return u"foou"
   1238 
   1239         class Foo7(unicode):
   1240             def __str__(self):
   1241                 return "foos"
   1242             def __unicode__(self):
   1243                 return u"foou"
   1244 
   1245         class Foo8(unicode):
   1246             def __new__(cls, content=""):
   1247                 return unicode.__new__(cls, 2*content)
   1248             def __unicode__(self):
   1249                 return self
   1250 
   1251         class Foo9(unicode):
   1252             def __str__(self):
   1253                 return "string"
   1254             def __unicode__(self):
   1255                 return "not unicode"
   1256 
   1257         self.assertEqual(unicode(Foo0()), u"foo")
   1258         self.assertEqual(unicode(Foo1()), u"foo")
   1259         self.assertEqual(unicode(Foo2()), u"foo")
   1260         self.assertEqual(unicode(Foo3()), u"foo")
   1261         self.assertEqual(unicode(Foo4("bar")), u"foo")
   1262         self.assertEqual(unicode(Foo5("bar")), u"foo")
   1263         self.assertEqual(unicode(Foo6("bar")), u"foou")
   1264         self.assertEqual(unicode(Foo7("bar")), u"foou")
   1265         self.assertEqual(unicode(Foo8("foo")), u"foofoo")
   1266         self.assertEqual(str(Foo9("foo")), "string")
   1267         self.assertEqual(unicode(Foo9("foo")), u"not unicode")
   1268 
   1269     def test_unicode_repr(self):
   1270         class s1:
   1271             def __repr__(self):
   1272                 return '\\n'
   1273 
   1274         class s2:
   1275             def __repr__(self):
   1276                 return u'\\n'
   1277 
   1278         self.assertEqual(repr(s1()), '\\n')
   1279         self.assertEqual(repr(s2()), '\\n')
   1280 
   1281     def test_expandtabs_overflows_gracefully(self):
   1282         # This test only affects 32-bit platforms because expandtabs can only take
   1283         # an int as the max value, not a 64-bit C long.  If expandtabs is changed
   1284         # to take a 64-bit long, this test should apply to all platforms.
   1285         if sys.maxint > (1 << 32) or struct.calcsize('P') != 4:
   1286             return
   1287         self.assertRaises(OverflowError, u't\tt\t'.expandtabs, sys.maxint)
   1288 
   1289     def test__format__(self):
   1290         def test(value, format, expected):
   1291             # test both with and without the trailing 's'
   1292             self.assertEqual(value.__format__(format), expected)
   1293             self.assertEqual(value.__format__(format + u's'), expected)
   1294 
   1295         test(u'', u'', u'')
   1296         test(u'abc', u'', u'abc')
   1297         test(u'abc', u'.3', u'abc')
   1298         test(u'ab', u'.3', u'ab')
   1299         test(u'abcdef', u'.3', u'abc')
   1300         test(u'abcdef', u'.0', u'')
   1301         test(u'abc', u'3.3', u'abc')
   1302         test(u'abc', u'2.3', u'abc')
   1303         test(u'abc', u'2.2', u'ab')
   1304         test(u'abc', u'3.2', u'ab ')
   1305         test(u'result', u'x<0', u'result')
   1306         test(u'result', u'x<5', u'result')
   1307         test(u'result', u'x<6', u'result')
   1308         test(u'result', u'x<7', u'resultx')
   1309         test(u'result', u'x<8', u'resultxx')
   1310         test(u'result', u' <7', u'result ')
   1311         test(u'result', u'<7', u'result ')
   1312         test(u'result', u'>7', u' result')
   1313         test(u'result', u'>8', u'  result')
   1314         test(u'result', u'^8', u' result ')
   1315         test(u'result', u'^9', u' result  ')
   1316         test(u'result', u'^10', u'  result  ')
   1317         test(u'a', u'10000', u'a' + u' ' * 9999)
   1318         test(u'', u'10000', u' ' * 10000)
   1319         test(u'', u'10000000', u' ' * 10000000)
   1320 
   1321         # test mixing unicode and str
   1322         self.assertEqual(u'abc'.__format__('s'), u'abc')
   1323         self.assertEqual(u'abc'.__format__('->10s'), u'-------abc')
   1324 
    def test_format(self):
        # Broad coverage of unicode.format(): literal text and brace
        # escaping, positional / keyword / attribute / index field lookup,
        # string format specs, !r and !s conversions, the fallback to
        # object.__format__, nested (computed) format specs, malformed
        # templates, and mixing str with unicode.

        # literal text; '{{' and '}}' stand for single braces
        self.assertEqual(u''.format(), u'')
        self.assertEqual(u'a'.format(), u'a')
        self.assertEqual(u'ab'.format(), u'ab')
        self.assertEqual(u'a{{'.format(), u'a{')
        self.assertEqual(u'a}}'.format(), u'a}')
        self.assertEqual(u'{{b'.format(), u'{b')
        self.assertEqual(u'}}b'.format(), u'}b')
        self.assertEqual(u'a{{b'.format(), u'a{b')

        # examples from the PEP:
        import datetime
        self.assertEqual(u"My name is {0}".format(u'Fred'), u"My name is Fred")
        self.assertEqual(u"My name is {0[name]}".format(dict(name=u'Fred')),
                         u"My name is Fred")
        self.assertEqual(u"My name is {0} :-{{}}".format(u'Fred'),
                         u"My name is Fred :-{}")

        # datetime.__format__ doesn't work with unicode
        #d = datetime.date(2007, 8, 18)
        #self.assertEqual("The year is {0.year}".format(d),
        #                 "The year is 2007")

        # classes we'll use for testing

        # C: __format__ hands the format spec back unchanged
        class C:
            def __init__(self, x=100):
                self._x = x
            def __format__(self, spec):
                return spec

        # D: __format__ ignores the spec and returns str(self.x)
        class D:
            def __init__(self, x):
                self.x = x
            def __format__(self, spec):
                return str(self.x)

        # class with __str__, but no __format__
        class E:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return u'E(' + self.x + u')'

        # class with __repr__, but no __format__ or __str__
        class F:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return u'F(' + self.x + u')'

        # class with __format__ that forwards to string, for some format_spec's
        class G:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return u"string is " + self.x
            def __format__(self, format_spec):
                if format_spec == 'd':
                    return u'G(' + self.x + u')'
                return object.__format__(self, format_spec)

        # class that returns a bad type from __format__
        # NOTE(review): H is never instantiated in this method -- confirm
        # whether an assertion using it is missing or the class is left over.
        class H:
            def __format__(self, format_spec):
                return 1.0

        # I: a date whose format spec is passed straight to strftime()
        class I(datetime.date):
            def __format__(self, format_spec):
                return self.strftime(format_spec)

        # J: an int that formats as twice its value
        class J(int):
            def __format__(self, format_spec):
                return int.__format__(self * 2, format_spec)


        # positional fields combined with literal text and escaped braces
        self.assertEqual(u''.format(), u'')
        self.assertEqual(u'abc'.format(), u'abc')
        self.assertEqual(u'{0}'.format(u'abc'), u'abc')
        self.assertEqual(u'{0:}'.format(u'abc'), u'abc')
        self.assertEqual(u'X{0}'.format(u'abc'), u'Xabc')
        self.assertEqual(u'{0}X'.format(u'abc'), u'abcX')
        self.assertEqual(u'X{0}Y'.format(u'abc'), u'XabcY')
        self.assertEqual(u'{1}'.format(1, u'abc'), u'abc')
        self.assertEqual(u'X{1}'.format(1, u'abc'), u'Xabc')
        self.assertEqual(u'{1}X'.format(1, u'abc'), u'abcX')
        self.assertEqual(u'X{1}Y'.format(1, u'abc'), u'XabcY')
        self.assertEqual(u'{0}'.format(-15), u'-15')
        self.assertEqual(u'{0}{1}'.format(-15, u'abc'), u'-15abc')
        self.assertEqual(u'{0}X{1}'.format(-15, u'abc'), u'-15Xabc')
        self.assertEqual(u'{{'.format(), u'{')
        self.assertEqual(u'}}'.format(), u'}')
        self.assertEqual(u'{{}}'.format(), u'{}')
        self.assertEqual(u'{{x}}'.format(), u'{x}')
        self.assertEqual(u'{{{0}}}'.format(123), u'{123}')
        self.assertEqual(u'{{{{0}}}}'.format(), u'{{0}}')
        self.assertEqual(u'}}{{'.format(), u'}{')
        self.assertEqual(u'}}x{{'.format(), u'}x{')

        # weird field names
        self.assertEqual(u"{0[foo-bar]}".format({u'foo-bar':u'baz'}), u'baz')
        self.assertEqual(u"{0[foo bar]}".format({u'foo bar':u'baz'}), u'baz')
        self.assertEqual(u"{0[ ]}".format({u' ':3}), u'3')

        # attribute and index lookups, including chained ones
        self.assertEqual(u'{foo._x}'.format(foo=C(20)), u'20')
        self.assertEqual(u'{1}{0}'.format(D(10), D(20)), u'2010')
        self.assertEqual(u'{0._x.x}'.format(C(D(u'abc'))), u'abc')
        self.assertEqual(u'{0[0]}'.format([u'abc', u'def']), u'abc')
        self.assertEqual(u'{0[1]}'.format([u'abc', u'def']), u'def')
        self.assertEqual(u'{0[1][0]}'.format([u'abc', [u'def']]), u'def')
        self.assertEqual(u'{0[1][0].x}'.format(['abc', [D(u'def')]]), u'def')

        # strings
        self.assertEqual(u'{0:.3s}'.format(u'abc'), u'abc')
        self.assertEqual(u'{0:.3s}'.format(u'ab'), u'ab')
        self.assertEqual(u'{0:.3s}'.format(u'abcdef'), u'abc')
        self.assertEqual(u'{0:.0s}'.format(u'abcdef'), u'')
        self.assertEqual(u'{0:3.3s}'.format(u'abc'), u'abc')
        self.assertEqual(u'{0:2.3s}'.format(u'abc'), u'abc')
        self.assertEqual(u'{0:2.2s}'.format(u'abc'), u'ab')
        self.assertEqual(u'{0:3.2s}'.format(u'abc'), u'ab ')
        self.assertEqual(u'{0:x<0s}'.format(u'result'), u'result')
        self.assertEqual(u'{0:x<5s}'.format(u'result'), u'result')
        self.assertEqual(u'{0:x<6s}'.format(u'result'), u'result')
        self.assertEqual(u'{0:x<7s}'.format(u'result'), u'resultx')
        self.assertEqual(u'{0:x<8s}'.format(u'result'), u'resultxx')
        self.assertEqual(u'{0: <7s}'.format(u'result'), u'result ')
        self.assertEqual(u'{0:<7s}'.format(u'result'), u'result ')
        self.assertEqual(u'{0:>7s}'.format(u'result'), u' result')
        self.assertEqual(u'{0:>8s}'.format(u'result'), u'  result')
        self.assertEqual(u'{0:^8s}'.format(u'result'), u' result ')
        self.assertEqual(u'{0:^9s}'.format(u'result'), u' result  ')
        self.assertEqual(u'{0:^10s}'.format(u'result'), u'  result  ')
        self.assertEqual(u'{0:10000}'.format(u'a'), u'a' + u' ' * 9999)
        self.assertEqual(u'{0:10000}'.format(u''), u' ' * 10000)
        self.assertEqual(u'{0:10000000}'.format(u''), u' ' * 10000000)

        # format specifiers for user defined type
        self.assertEqual(u'{0:abc}'.format(C()), u'abc')

        # !r and !s coercions
        self.assertEqual(u'{0!s}'.format(u'Hello'), u'Hello')
        self.assertEqual(u'{0!s:}'.format(u'Hello'), u'Hello')
        self.assertEqual(u'{0!s:15}'.format(u'Hello'), u'Hello          ')
        self.assertEqual(u'{0!s:15s}'.format(u'Hello'), u'Hello          ')
        self.assertEqual(u'{0!r}'.format(u'Hello'), u"u'Hello'")
        self.assertEqual(u'{0!r:}'.format(u'Hello'), u"u'Hello'")
        self.assertEqual(u'{0!r}'.format(F(u'Hello')), u'F(Hello)')

        # test fallback to object.__format__
        self.assertEqual(u'{0}'.format({}), u'{}')
        self.assertEqual(u'{0}'.format([]), u'[]')
        self.assertEqual(u'{0}'.format([1]), u'[1]')
        self.assertEqual(u'{0}'.format(E(u'data')), u'E(data)')
        self.assertEqual(u'{0:d}'.format(G(u'data')), u'G(data)')
        self.assertEqual(u'{0!s}'.format(G(u'data')), u'string is data')

        # E has no __format__ and G delegates to object.__format__; with a
        # non-empty spec these calls run under check_warnings, filtered on
        # the PendingDeprecationWarning message below
        msg = 'object.__format__ with a non-empty format string is deprecated'
        with test_support.check_warnings((msg, PendingDeprecationWarning)):
            self.assertEqual(u'{0:^10}'.format(E(u'data')), u' E(data)  ')
            self.assertEqual(u'{0:^10s}'.format(E(u'data')), u' E(data)  ')
            self.assertEqual(u'{0:>15s}'.format(G(u'data')), u' string is data')

        # everything after the first ':' is the spec, so I.__format__
        # receives "date: %Y-%m-%d" and passes it to strftime()
        self.assertEqual(u"{0:date: %Y-%m-%d}".format(I(year=2007,
                                                        month=8,
                                                        day=27)),
                         u"date: 2007-08-27")

        # test deriving from a builtin type and overriding __format__
        self.assertEqual(u"{0}".format(J(10)), u"20")


        # string format specifiers
        self.assertEqual(u'{0:}'.format('a'), u'a')

        # computed format specifiers
        self.assertEqual(u"{0:.{1}}".format(u'hello world', 5), u'hello')
        self.assertEqual(u"{0:.{1}s}".format(u'hello world', 5), u'hello')
        self.assertEqual(u"{0:.{precision}s}".format('hello world', precision=5), u'hello')
        self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width=10, precision=5), u'hello     ')
        self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), u'hello     ')

        # test various errors
        self.assertRaises(ValueError, u'{'.format)
        self.assertRaises(ValueError, u'}'.format)
        self.assertRaises(ValueError, u'a{'.format)
        self.assertRaises(ValueError, u'a}'.format)
        self.assertRaises(ValueError, u'{a'.format)
        self.assertRaises(ValueError, u'}a'.format)
        self.assertRaises(IndexError, u'{0}'.format)
        self.assertRaises(IndexError, u'{1}'.format, u'abc')
        self.assertRaises(KeyError,   u'{x}'.format)
        self.assertRaises(ValueError, u"}{".format)
        self.assertRaises(ValueError, u"{".format)
        self.assertRaises(ValueError, u"}".format)
        self.assertRaises(ValueError, u"abc{0:{}".format)
        self.assertRaises(ValueError, u"{0".format)
        self.assertRaises(IndexError, u"{0.}".format)
        self.assertRaises(ValueError, u"{0.}".format, 0)
        self.assertRaises(IndexError, u"{0[}".format)
        self.assertRaises(ValueError, u"{0[}".format, [])
        self.assertRaises(KeyError,   u"{0]}".format)
        self.assertRaises(ValueError, u"{0.[]}".format, 0)
        self.assertRaises(ValueError, u"{0..foo}".format, 0)
        self.assertRaises(ValueError, u"{0[0}".format, 0)
        self.assertRaises(ValueError, u"{0[0:foo}".format, 0)
        self.assertRaises(KeyError,   u"{c]}".format)
        self.assertRaises(ValueError, u"{{ {{{0}}".format, 0)
        self.assertRaises(ValueError, u"{0}}".format, 0)
        self.assertRaises(KeyError,   u"{foo}".format, bar=3)
        self.assertRaises(ValueError, u"{0!x}".format, 3)
        self.assertRaises(ValueError, u"{0!}".format, 0)
        self.assertRaises(ValueError, u"{0!rs}".format, 0)
        self.assertRaises(ValueError, u"{!}".format)
        self.assertRaises(IndexError, u"{:}".format)
        self.assertRaises(IndexError, u"{:s}".format)
        self.assertRaises(IndexError, u"{}".format)
        # a field number / index far too large to be usable
        big = u"23098475029384702983476098230754973209482573"
        self.assertRaises(ValueError, (u"{" + big + u"}").format)
        self.assertRaises(ValueError, (u"{[" + big + u"]}").format, [0])

        # issue 6089
        self.assertRaises(ValueError, u"{0[0]x}".format, [None])
        self.assertRaises(ValueError, u"{0[0](10)}".format, [None])

        # can't have a replacement on the field name portion
        self.assertRaises(TypeError, u'{0[{1}]}'.format, u'abcdefg', 4)

        # exceed maximum recursion depth
        self.assertRaises(ValueError, u"{0:{1:{2}}}".format, u'abc', u's', u'')
        self.assertRaises(ValueError, u"{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                          0, 1, 2, 3, 4, 5, 6, 7)

        # string format spec errors
        self.assertRaises(ValueError, u"{0:-s}".format, u'')
        self.assertRaises(ValueError, format, u"", u"-")
        self.assertRaises(ValueError, u"{0:=s}".format, u'')

        # test combining string and unicode
        self.assertEqual(u"foo{0}".format('bar'), u'foobar')
        # This will try to convert the argument from unicode to str, which
        #  will succeed
        self.assertEqual("foo{0}".format(u'bar'), 'foobar')
        # This will try to convert the argument from unicode to str, which
        #  will fail
        self.assertRaises(UnicodeEncodeError, "foo{0}".format, u'\u1000bar')
   1570 
   1571     def test_format_huge_precision(self):
   1572         format_string = u".{}f".format(sys.maxsize + 1)
   1573         with self.assertRaises(ValueError):
   1574             result = format(2.34, format_string)
   1575 
   1576     def test_format_huge_width(self):
   1577         format_string = u"{}f".format(sys.maxsize + 1)
   1578         with self.assertRaises(ValueError):
   1579             result = format(2.34, format_string)
   1580 
   1581     def test_format_huge_item_number(self):
   1582         format_string = u"{{{}:.6f}}".format(sys.maxsize + 1)
   1583         with self.assertRaises(ValueError):
   1584             result = format_string.format(2.34)
   1585 
   1586     def test_format_auto_numbering(self):
   1587         class C:
   1588             def __init__(self, x=100):
   1589                 self._x = x
   1590             def __format__(self, spec):
   1591                 return spec
   1592 
   1593         self.assertEqual(u'{}'.format(10), u'10')
   1594         self.assertEqual(u'{:5}'.format('s'), u's    ')
   1595         self.assertEqual(u'{!r}'.format('s'), u"'s'")
   1596         self.assertEqual(u'{._x}'.format(C(10)), u'10')
   1597         self.assertEqual(u'{[1]}'.format([1, 2]), u'2')
   1598         self.assertEqual(u'{[a]}'.format({'a':4, 'b':2}), u'4')
   1599         self.assertEqual(u'a{}b{}c'.format(0, 1), u'a0b1c')
   1600 
   1601         self.assertEqual(u'a{:{}}b'.format('x', '^10'), u'a    x     b')
   1602         self.assertEqual(u'a{:{}x}b'.format(20, '#'), u'a0x14b')
   1603 
   1604         # can't mix and match numbering and auto-numbering
   1605         self.assertRaises(ValueError, u'{}{1}'.format, 1, 2)
   1606         self.assertRaises(ValueError, u'{1}{}'.format, 1, 2)
   1607         self.assertRaises(ValueError, u'{:{1}}'.format, 1, 2)
   1608         self.assertRaises(ValueError, u'{0:{}}'.format, 1, 2)
   1609 
   1610         # can mix and match auto-numbering and named
   1611         self.assertEqual(u'{f}{}'.format(4, f='test'), u'test4')
   1612         self.assertEqual(u'{}{f}'.format(4, f='test'), u'4test')
   1613         self.assertEqual(u'{:{f}}{g}{}'.format(1, 3, g='g', f=2), u' 1g3')
   1614         self.assertEqual(u'{f:{}}{}{g}'.format(2, 4, f=1, g='g'), u' 14g')
   1615 
   1616     def test_raiseMemError(self):
   1617         # Ensure that the freelist contains a consistent object, even
   1618         # when a string allocation fails with a MemoryError.
   1619         # This used to crash the interpreter,
   1620         # or leak references when the number was smaller.
   1621         charwidth = 4 if sys.maxunicode >= 0x10000 else 2
   1622         # Note: sys.maxsize is half of the actual max allocation because of
   1623         # the signedness of Py_ssize_t.
   1624         alloc = lambda: u"a" * (sys.maxsize // charwidth * 2)
   1625         self.assertRaises(MemoryError, alloc)
   1626         self.assertRaises(MemoryError, alloc)
   1627 
   1628     def test_format_subclass(self):
   1629         class U(unicode):
   1630             def __unicode__(self):
   1631                 return u'__unicode__ overridden'
   1632         u = U(u'xxx')
   1633         self.assertEqual("%s" % u, u'__unicode__ overridden')
   1634         self.assertEqual("{}".format(u), '__unicode__ overridden')
   1635 
   1636     def test_encode_decimal(self):
   1637         from _testcapi import unicode_encodedecimal
   1638         self.assertEqual(unicode_encodedecimal(u'123'),
   1639                          b'123')
   1640         self.assertEqual(unicode_encodedecimal(u'\u0663.\u0661\u0664'),
   1641                          b'3.14')
   1642         self.assertEqual(unicode_encodedecimal(u"\N{EM SPACE}3.14\N{EN SPACE}"),
   1643                          b' 3.14 ')
   1644         self.assertRaises(UnicodeEncodeError,
   1645                           unicode_encodedecimal, u"123\u20ac", "strict")
   1646         self.assertEqual(unicode_encodedecimal(u"123\u20ac", "replace"),
   1647                          b'123?')
   1648         self.assertEqual(unicode_encodedecimal(u"123\u20ac", "ignore"),
   1649                          b'123')
   1650         self.assertEqual(unicode_encodedecimal(u"123\u20ac", "xmlcharrefreplace"),
   1651                          b'123&#8364;')
   1652         self.assertEqual(unicode_encodedecimal(u"123\u20ac", "backslashreplace"),
   1653                          b'123\\u20ac')
   1654         self.assertEqual(unicode_encodedecimal(u"123\u20ac\N{EM SPACE}", "replace"),
   1655                          b'123? ')
   1656         self.assertEqual(unicode_encodedecimal(u"123\u20ac\u20ac", "replace"),
   1657                          b'123??')
   1658         self.assertEqual(unicode_encodedecimal(u"123\u20ac\u0660", "replace"),
   1659                          b'123?0')
   1660 
   1661 
def test_main():
    # Entry point for this test module: passes the module's own name to
    # test_support.run_unittest.
    test_support.run_unittest(__name__)
   1664 
# Run the tests when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()
   1667