Home | History | Annotate | Download | only in test
      1 """ Test script for the Unicode implementation.
      2 
      3 Written by Marc-Andre Lemburg (mal (at] lemburg.com).
      4 
      5 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
      6 
      7 """#"
      8 import sys
      9 import struct
     10 import codecs
     11 import unittest
     12 from test import test_support, string_tests
     13 
     14 # decorator to skip tests on narrow builds
     15 requires_wide_build = unittest.skipIf(sys.maxunicode == 65535,
     16                                       'requires wide build')
     17 
     18 # Error handling (bad decoder return)
     19 def search_function(encoding):
     20     def decode1(input, errors="strict"):
     21         return 42 # not a tuple
     22     def encode1(input, errors="strict"):
     23         return 42 # not a tuple
     24     def encode2(input, errors="strict"):
     25         return (42, 42) # no unicode
     26     def decode2(input, errors="strict"):
     27         return (42, 42) # no unicode
     28     if encoding=="test.unicode1":
     29         return (encode1, decode1, None, None)
     30     elif encoding=="test.unicode2":
     31         return (encode2, decode2, None, None)
     32     else:
     33         return None
     34 codecs.register(search_function)
     35 
     36 class UnicodeSubclass(unicode):
     37     pass
     38 
     39 class UnicodeTest(
     40     string_tests.CommonTest,
     41     string_tests.MixinStrUnicodeUserStringTest,
     42     string_tests.MixinStrUnicodeTest,
     43     ):
     44     type2test = unicode
     45 
     46     def assertEqual(self, first, second, msg=None):
     47         # strict assertEqual method: reject implicit bytes/unicode equality
     48         super(UnicodeTest, self).assertEqual(first, second, msg)
     49         if isinstance(first, unicode) or isinstance(second, unicode):
     50             self.assertIsInstance(first, unicode)
     51             self.assertIsInstance(second, unicode)
     52         elif isinstance(first, str) or isinstance(second, str):
     53             self.assertIsInstance(first, str)
     54             self.assertIsInstance(second, str)
     55 
     56     def checkequalnofix(self, result, object, methodname, *args):
     57         method = getattr(object, methodname)
     58         realresult = method(*args)
     59         self.assertEqual(realresult, result)
     60         self.assertTrue(type(realresult) is type(result))
     61 
     62         # if the original is returned make sure that
     63         # this doesn't happen with subclasses
     64         if realresult is object:
     65             class usub(unicode):
     66                 def __repr__(self):
     67                     return 'usub(%r)' % unicode.__repr__(self)
     68             object = usub(object)
     69             method = getattr(object, methodname)
     70             realresult = method(*args)
     71             self.assertEqual(realresult, result)
     72             self.assertTrue(object is not realresult)
     73 
     74     def test_literals(self):
     75         self.assertEqual(u'\xff', u'\u00ff')
     76         self.assertEqual(u'\uffff', u'\U0000ffff')
     77         self.assertRaises(SyntaxError, eval, 'u\'\\Ufffffffe\'')
     78         self.assertRaises(SyntaxError, eval, 'u\'\\Uffffffff\'')
     79         self.assertRaises(SyntaxError, eval, 'u\'\\U%08x\'' % 0x110000)
     80 
     81     def test_repr(self):
     82         if not sys.platform.startswith('java'):
     83             # Test basic sanity of repr()
     84             self.assertEqual(repr(u'abc'), "u'abc'")
     85             self.assertEqual(repr(u'ab\\c'), "u'ab\\\\c'")
     86             self.assertEqual(repr(u'ab\\'), "u'ab\\\\'")
     87             self.assertEqual(repr(u'\\c'), "u'\\\\c'")
     88             self.assertEqual(repr(u'\\'), "u'\\\\'")
     89             self.assertEqual(repr(u'\n'), "u'\\n'")
     90             self.assertEqual(repr(u'\r'), "u'\\r'")
     91             self.assertEqual(repr(u'\t'), "u'\\t'")
     92             self.assertEqual(repr(u'\b'), "u'\\x08'")
     93             self.assertEqual(repr(u"'\""), """u'\\'"'""")
     94             self.assertEqual(repr(u"'\""), """u'\\'"'""")
     95             self.assertEqual(repr(u"'"), '''u"'"''')
     96             self.assertEqual(repr(u'"'), """u'"'""")
     97             latin1repr = (
     98                 "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
     99                 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
    100                 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
    101                 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
    102                 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
    103                 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
    104                 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
    105                 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
    106                 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
    107                 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
    108                 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
    109                 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
    110                 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
    111                 "\\xfe\\xff'")
    112             testrepr = repr(u''.join(map(unichr, xrange(256))))
    113             self.assertEqual(testrepr, latin1repr)
    114             # Test repr works on wide unicode escapes without overflow.
    115             self.assertEqual(repr(u"\U00010000" * 39 + u"\uffff" * 4096),
    116                              repr(u"\U00010000" * 39 + u"\uffff" * 4096))
    117 
    118 
    def test_count(self):
        string_tests.CommonTest.test_count(self)
        # check mixed argument types, then negative start/end bounds.
        # Each row is (expected count, receiver, arguments to count()).
        cases = (
            (3,  'aaa', (u'a',)),
            (0,  'aaa', (u'b',)),
            (3, u'aaa', ('a',)),
            (0, u'aaa', ('b',)),
            (0, u'aaa', ('b',)),
            (1, u'aaa', ('a', -1)),
            (3, u'aaa', ('a', -10)),
            (2, u'aaa', ('a', 0, -1)),
            (0, u'aaa', ('a', 0, -10)),
        )
        for expected, receiver, count_args in cases:
            self.checkequalnofix(expected, receiver, 'count', *count_args)
    131 
    def test_find(self):
        # Each row is (expected index, arguments to find()).
        cases = (
            (0,  (u'abc',)),
            (9,  (u'abc', 1)),
            (-1, (u'def', 4)),
        )
        for expected, find_args in cases:
            self.checkequalnofix(expected, u'abcdefghiabc', 'find', *find_args)

        # find() needs a substring argument, and it must be a string.
        with self.assertRaises(TypeError):
            u'hello'.find()
        with self.assertRaises(TypeError):
            u'hello'.find(42)
    139 
    def test_rfind(self):
        string_tests.CommonTest.test_rfind(self)
        # check mixed argument types; rows are (expected, receiver, needle)
        cases = (
            (9,   'abcdefghiabc', u'abc'),
            (12,  'abcdefghiabc', u''),
            (12, u'abcdefghiabc',  ''),
        )
        for expected, receiver, needle in cases:
            self.checkequalnofix(expected, receiver, 'rfind', needle)
    146 
    def test_index(self):
        string_tests.CommonTest.test_index(self)
        # check mixed argument types: receiver and needle are built with
        # opposite string types in each pass.
        found = (
            (0, '', ()),
            (3, 'def', ()),
            (0, 'abc', ()),
            (9, 'abc', (1,)),
        )
        missing = (
            ('abcdefghiabc', 'hib', ()),
            ('abcdefghiab', 'abc', (1,)),
            ('abcdefghi', 'ghi', (8,)),
            ('abcdefghi', 'ghi', (-1,)),
        )
        for receiver_type, needle_type in ((str, unicode), (unicode, str)):
            for expected, needle, bounds in found:
                self.checkequalnofix(expected, receiver_type('abcdefghiabc'),
                                     'index', needle_type(needle), *bounds)
            for text, needle, bounds in missing:
                self.assertRaises(ValueError, receiver_type(text).index,
                                  needle_type(needle), *bounds)
    159 
    def test_rindex(self):
        string_tests.CommonTest.test_rindex(self)
        # check mixed argument types: receiver and needle are built with
        # opposite string types in each pass.
        found = (
            (12, '', ()),
            (3,  'def', ()),
            (9,  'abc', ()),
            (0,  'abc', (0, -1)),
        )
        missing = (
            ('abcdefghiabc', 'hib', ()),
            ('defghiabc', 'def', (1,)),
            ('defghiabc', 'abc', (0, -1)),
            ('abcdefghi', 'ghi', (0, 8)),
            ('abcdefghi', 'ghi', (0, -1)),
        )
        for receiver_type, needle_type in ((str, unicode), (unicode, str)):
            for expected, needle, bounds in found:
                self.checkequalnofix(expected, receiver_type('abcdefghiabc'),
                                     'rindex', needle_type(needle), *bounds)
            for text, needle, bounds in missing:
                self.assertRaises(ValueError, receiver_type(text).rindex,
                                  needle_type(needle), *bounds)
    174 
    def test_translate(self):
        # Table values may be None (delete), an ordinal, or a unicode
        # string of any length.  Rows are (expected, source, table).
        cases = (
            (u'bbbc', u'abababc', {ord('a'):None}),
            (u'iiic', u'abababc', {ord('a'):None, ord('b'):ord('i')}),
            (u'iiix', u'abababc',
             {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'}),
            (u'<i><i><i>c', u'abababc', {ord('a'):None, ord('b'):u'<i>'}),
            (u'c', u'abababc', {ord('a'):None, ord('b'):u''}),
            (u'xyyx', u'xzx', {ord('z'):u'yy'}),
        )
        for expected, source, table in cases:
            self.checkequalnofix(expected, source, 'translate', table)

        # A missing table, or a str (non-unicode) replacement, is a TypeError.
        with self.assertRaises(TypeError):
            u'hello'.translate()
        with self.assertRaises(TypeError):
            u'abababc'.translate({ord('a'):''})
    185 
    def test_split(self):
        string_tests.CommonTest.test_split(self)

        # Mixed arguments: rows are (expected pieces, receiver, separator)
        cases = (
            ([u'a', u'b', u'c', u'd'], u'a//b//c//d', '//'),
            ([u'a', u'b', u'c', u'd'], 'a//b//c//d', u'//'),
            ([u'endcase ', u''], u'endcase test', 'test'),
        )
        for pieces, receiver, separator in cases:
            self.checkequalnofix(pieces, receiver, 'split', separator)
    193 
    def test_join(self):
        string_tests.MixinStrUnicodeUserStringTest.test_join(self)

        # mixed arguments: rows are (expected, separator, items to join)
        cases = (
            (u'a b c d', u' ', ['a', 'b', u'c', u'd']),
            (u'abcd', u'', (u'a', u'b', u'c', u'd')),
            (u'w x y z', u' ', string_tests.Sequence('wxyz')),
            (u'a b c d', ' ', [u'a', u'b', u'c', u'd']),
            (u'a b c d', ' ', ['a', 'b', u'c', u'd']),
            (u'abcd', '', (u'a', u'b', u'c', u'd')),
            (u'w x y z', ' ', string_tests.Sequence(u'wxyz')),
        )
        for expected, separator, items in cases:
            self.checkequalnofix(expected, separator, 'join', items)
    205 
    def test_strip(self):
        string_tests.CommonTest.test_strip(self)
        # A non-ASCII byte string as the strip set must raise UnicodeError.
        with self.assertRaises(UnicodeError):
            u"hello".strip("\xff")
    209 
    def test_replace(self):
        string_tests.CommonTest.test_replace(self)

        # method call forwarded from str implementation because of unicode argument
        self.checkequalnofix(u'one@two!three!', 'one!two!three!',
                             'replace', u'!', u'@', 1)
        # A non-string replacement is still rejected in that case.
        with self.assertRaises(TypeError):
            'replace'.replace(u"r", 42)
    216 
    def test_comparison(self):
        # Comparisons between str and unicode operands.  assertTrue is
        # used rather than assertEqual because this class's assertEqual
        # rejects operands of different string types.
        self.assertTrue(u'abc' == 'abc')
        self.assertTrue('abc' == u'abc')
        self.assertTrue(u'abc' == u'abc')
        self.assertTrue(u'abcd' > 'abc')
        self.assertTrue('abcd' > u'abc')
        self.assertTrue(u'abcd' > u'abc')
        self.assertTrue(u'abc' < 'abcd')
        self.assertTrue('abc' < u'abcd')
        self.assertTrue(u'abc' < u'abcd')

        # Deliberately disabled block, kept for reference only.
        if 0:
            # Move these tests to a Unicode collation module test...
            # Testing UTF-16 code point order comparisons...

            # No surrogates, no fixup required.
            self.assertTrue(u'\u0061' < u'\u20ac')
            # Non surrogate below surrogate value, no fixup required
            self.assertTrue(u'\u0061' < u'\ud800\udc02')

            # Non surrogate above surrogate value, fixup required
            def test_lecmp(s, s2):
                self.assertTrue(s < s2)

            def test_fixup(s):
                # Compare s against every high/low surrogate combination,
                # in the same order as the former hand-written list.
                for low in (u'\udc01', u'\udd01', u'\ude01', u'\udfff'):
                    for high in (u'\ud800', u'\ud900', u'\uda00', u'\udb00'):
                        test_lecmp(s, high + low)

            # These calls used to be indented inside test_fixup itself,
            # so the helper could never have been invoked.
            test_fixup(u'\ue000')
            test_fixup(u'\uff61')

        # Surrogates on both sides, no fixup required
        self.assertTrue(u'\ud800\udc02' < u'\ud84d\udc56')
    281 
    def test_capitalize(self):
        string_tests.CommonTest.test_capitalize(self)
        # Rows are (expected, source) for source.capitalize().
        cases = (
            # check that titlecased chars are lowered correctly
            # \u1ffc is the titlecased char
            (u'\u1ffc\u1ff3\u1ff3\u1ff3', u'\u1ff3\u1ff3\u1ffc\u1ffc'),
            # check with cased non-letter chars
            (u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
             u'\u24c5\u24ce\u24c9\u24bd\u24c4\u24c3'),
            (u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
             u'\u24df\u24e8\u24e3\u24d7\u24de\u24dd'),
            (u'\u2160\u2171\u2172', u'\u2160\u2161\u2162'),
            (u'\u2160\u2171\u2172', u'\u2170\u2171\u2172'),
            # check with Ll chars with no upper - nothing changes here
            (u'\u019b\u1d00\u1d86\u0221\u1fb7',
             u'\u019b\u1d00\u1d86\u0221\u1fb7'),
        )
        for expected, source in cases:
            self.checkequal(expected, source, 'capitalize')
    300 
    def test_islower(self):
        string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
        # U+1FFC (described as titlecased in test_capitalize) is not lower.
        titlecased = u'\u1FFc'
        self.checkequalnofix(False, titlecased, 'islower')
    304 
    @requires_wide_build
    def test_islower_non_bmp(self):
        # Rows are (character, expected islower() result).
        cases = (
            # non-BMP, uppercase
            (u'\U00010401', False),
            (u'\U00010427', False),
            # non-BMP, lowercase
            (u'\U00010429', True),
            (u'\U0001044E', True),
            # non-BMP, non-cased
            (u'\U0001F40D', False),
            (u'\U0001F46F', False),
        )
        for char, expected in cases:
            check = self.assertTrue if expected else self.assertFalse
            check(char.islower())
    316 
    def test_isupper(self):
        string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
        # The U+1FFC check is not run when sys.platform starts with 'java'.
        on_java = sys.platform.startswith('java')
        if not on_java:
            self.checkequalnofix(False, u'\u1FFc', 'isupper')
    321 
    @requires_wide_build
    def test_isupper_non_bmp(self):
        # Rows are (character, expected isupper() result).
        cases = (
            # non-BMP, uppercase
            (u'\U00010401', True),
            (u'\U00010427', True),
            # non-BMP, lowercase
            (u'\U00010429', False),
            (u'\U0001044E', False),
            # non-BMP, non-cased
            (u'\U0001F40D', False),
            (u'\U0001F46F', False),
        )
        for char, expected in cases:
            check = self.assertTrue if expected else self.assertFalse
            check(char.isupper())
    333 
    def test_istitle(self):
        string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
        # U+1FFC counts as a title character, alone and inside a word.
        for text in (u'\u1FFc', u'Greek \u1FFcitlecases ...'):
            self.checkequalnofix(True, text, 'istitle')
    338 
    @requires_wide_build
    def test_istitle_non_bmp(self):
        # non-BMP, uppercase + lowercase
        for titled in (u'\U00010401\U00010429', u'\U00010427\U0001044E'):
            self.assertTrue(titled.istitle())
        # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
        not_title = (u'\U00010429', u'\U0001044E', u'\U0001F40D', u'\U0001F46F')
        for char in not_title:
            self.assertFalse(char.istitle(), '{!r} is not title'.format(char))
    347 
    def test_isspace(self):
        string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
        # Rows are (expected isspace() result, character).
        for expected, char in ((True, u'\u2000'),
                               (True, u'\u200a'),
                               (False, u'\u2014')):
            self.checkequalnofix(expected, char, 'isspace')
    353 
    @requires_wide_build
    def test_isspace_non_bmp(self):
        # apparently there are no non-BMP spaces chars in Unicode 6
        non_spaces = (u'\U00010401', u'\U00010427', u'\U00010429',
                      u'\U0001044E', u'\U0001F40D', u'\U0001F46F')
        for char in non_spaces:
            self.assertFalse(char.isspace(), '{!r} is not space.'.format(char))
    360 
    @requires_wide_build
    def test_isalnum_non_bmp(self):
        # Every listed non-BMP character must report isalnum().
        alnum_chars = (u'\U00010401', u'\U00010427', u'\U00010429',
                       u'\U0001044E', u'\U0001D7F6', u'\U000104A0',
                       u'\U000104A0', u'\U0001F107')
        for char in alnum_chars:
            self.assertTrue(char.isalnum(), '{!r} is alnum.'.format(char))
    366 
    def test_isalpha(self):
        string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
        # U+1FFC must be reported as alphabetic.
        titlecased = u'\u1FFc'
        self.checkequalnofix(True, titlecased, 'isalpha')
    370 
    @requires_wide_build
    def test_isalpha_non_bmp(self):
        # non-BMP, cased
        for cased in (u'\U00010401', u'\U00010427',
                      u'\U00010429', u'\U0001044E'):
            self.assertTrue(cased.isalpha())
        # non-BMP, non-cased
        for uncased in (u'\U0001F40D', u'\U0001F46F'):
            self.assertFalse(uncased.isalpha())
    381 
    def test_isdecimal(self):
        # Rows are (expected isdecimal() result, text).
        cases = (
            (False, u''),
            (False, u'a'),
            (True, u'0'),
            (False, u'\u2460'),  # CIRCLED DIGIT ONE
            (False, u'\xbc'),    # VULGAR FRACTION ONE QUARTER
            (True, u'\u0660'),   # ARABIC-INDIC DIGIT ZERO
            (True, u'0123456789'),
            (False, u'0123456789a'),
        )
        for expected, text in cases:
            self.checkequalnofix(expected, text, 'isdecimal')

        # isdecimal() takes no arguments.
        self.checkraises(TypeError, 'abc', 'isdecimal', 42)
    393 
    @requires_wide_build
    def test_isdecimal_non_bmp(self):
        # Non-BMP characters that must not report isdecimal() ...
        non_decimals = (u'\U00010401', u'\U00010427', u'\U00010429',
                        u'\U0001044E', u'\U0001F40D', u'\U0001F46F',
                        u'\U00011065', u'\U0001F107')
        for char in non_decimals:
            self.assertFalse(char.isdecimal(),
                             '{!r} is not decimal.'.format(char))
        # ... and the ones that must.
        decimals = (u'\U0001D7F6', u'\U000104A0', u'\U000104A0')
        for char in decimals:
            self.assertTrue(char.isdecimal(), '{!r} is decimal.'.format(char))
    401 
    def test_isdigit(self):
        string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
        # Rows are (expected isdigit() result, character).
        for expected, char in ((True, u'\u2460'),
                               (False, u'\xbc'),
                               (True, u'\u0660')):
            self.checkequalnofix(expected, char, 'isdigit')
    407 
    @requires_wide_build
    def test_isdigit_non_bmp(self):
        # Non-BMP characters that must not report isdigit() ...
        non_digits = (u'\U00010401', u'\U00010427', u'\U00010429',
                      u'\U0001044E', u'\U0001F40D', u'\U0001F46F',
                      u'\U00011065')
        for char in non_digits:
            self.assertFalse(char.isdigit(),
                             '{!r} is not a digit.'.format(char))
        # ... and the ones that must.
        digits = (u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107')
        for char in digits:
            self.assertTrue(char.isdigit(), '{!r} is a digit.'.format(char))
    415 
    def test_isnumeric(self):
        # Rows are (expected isnumeric() result, text).
        cases = (
            (False, u''),
            (False, u'a'),
            (True, u'0'),
            (True, u'\u2460'),
            (True, u'\xbc'),
            (True, u'\u0660'),
            (True, u'0123456789'),
            (False, u'0123456789a'),
        )
        for expected, text in cases:
            self.checkequalnofix(expected, text, 'isnumeric')

        # isnumeric() takes no arguments.
        with self.assertRaises(TypeError):
            u"abc".isnumeric(42)
    427 
    @requires_wide_build
    def test_isnumeric_non_bmp(self):
        # Non-BMP characters that must not report isnumeric() ...
        non_numerics = (u'\U00010401', u'\U00010427', u'\U00010429',
                        u'\U0001044E', u'\U0001F40D', u'\U0001F46F')
        for char in non_numerics:
            self.assertFalse(char.isnumeric(),
                             '{!r} is not numeric.'.format(char))
        # ... and the ones that must.
        numerics = (u'\U00010107', u'\U0001D7F6', u'\U00023b1b',
                    u'\U000104A0', u'\U0001F107')
        for char in numerics:
            self.assertTrue(char.isnumeric(), '{!r} is numeric.'.format(char))
    436 
    @requires_wide_build
    def test_surrogates(self):
        # this test actually passes on narrow too, but it's just by accident.
        # Surrogates are seen as non-cased chars, so u'X\uD800X' is as
        # uppercase as 'X X'
        lower_mixed = (u'a\uD800b\uDFFF', u'a\uDFFFb\uD800',
                       u'a\uD800b\uDFFFa', u'a\uDFFFb\uD800a')
        upper_mixed = (u'A\uD800B\uDFFF', u'A\uDFFFB\uD800',
                       u'A\uD800B\uDFFFA', u'A\uDFFFB\uD800A')
        lone_surrogates = (u'\uD800', u'\uDFFF',
                           u'\uD800\uD800', u'\uDFFF\uDFFF')

        for text in lower_mixed:
            self.assertTrue(text.islower())
            self.assertFalse(text.isupper())
            self.assertFalse(text.istitle())
        for text in upper_mixed:
            self.assertFalse(text.islower())
            self.assertTrue(text.isupper())
            self.assertTrue(text.istitle())

        # Strings made only of surrogates are not cased at all.
        for meth_name in ('islower', 'isupper', 'istitle'):
            predicate = getattr(unicode, meth_name)
            for text in lone_surrogates:
                self.assertFalse(predicate(text),
                                 '%r.%s() is False' % (text, meth_name))

        # Any string containing a surrogate fails the character-class
        # predicates.
        for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
                          'isdecimal', 'isnumeric'):
            predicate = getattr(unicode, meth_name)
            for text in lone_surrogates + lower_mixed:
                self.assertFalse(predicate(text),
                                 '%r.%s() is False' % (text, meth_name))
    465 
    466 
    def test_lower(self):
        # Run the shared lower() checks unconditionally.  Previously the
        # whole method carried the wide-build skip, which also skipped
        # these common checks on narrow builds.
        string_tests.CommonTest.test_lower(self)

    @requires_wide_build
    def test_lower_non_bmp(self):
        # Non-BMP cased letters: U+10427 lowercases to U+1044F.
        # Rows are (source, expected source.lower()).
        cases = (
            (u'\U00010427', u'\U0001044F'),
            (u'\U00010427\U00010427', u'\U0001044F\U0001044F'),
            (u'\U00010427\U0001044F', u'\U0001044F\U0001044F'),
            (u'X\U00010427x\U0001044F', u'x\U0001044Fx\U0001044F'),
        )
        for source, expected in cases:
            self.assertEqual(source.lower(), expected)
    477 
    def test_upper(self):
        # Run the shared upper() checks unconditionally.  Previously the
        # whole method carried the wide-build skip, which also skipped
        # these common checks on narrow builds.
        string_tests.CommonTest.test_upper(self)

    @requires_wide_build
    def test_upper_non_bmp(self):
        # Non-BMP cased letters: U+1044F uppercases to U+10427.
        # Rows are (source, expected source.upper()).
        cases = (
            (u'\U0001044F', u'\U00010427'),
            (u'\U0001044F\U0001044F', u'\U00010427\U00010427'),
            (u'\U00010427\U0001044F', u'\U00010427\U00010427'),
            (u'X\U00010427x\U0001044F', u'X\U00010427X\U00010427'),
        )
        for source, expected in cases:
            self.assertEqual(source.upper(), expected)
    488 
    @requires_wide_build
    def test_capitalize_wide_build(self):
        string_tests.CommonTest.test_capitalize(self)
        # Non-BMP cased letters: the first character is uppercased and
        # the rest lowercased.  Rows are (source, expected capitalize()).
        cases = (
            (u'\U0001044F', u'\U00010427'),
            (u'\U0001044F\U0001044F', u'\U00010427\U0001044F'),
            (u'\U00010427\U0001044F', u'\U00010427\U0001044F'),
            (u'\U0001044F\U00010427', u'\U00010427\U0001044F'),
            (u'X\U00010427x\U0001044F', u'X\U0001044Fx\U0001044F'),
        )
        for source, expected in cases:
            self.assertEqual(source.capitalize(), expected)
    501 
    def test_title(self):
        # Run the shared title() checks unconditionally.  Previously the
        # whole method carried the wide-build skip, which also skipped
        # these common checks on narrow builds.
        string_tests.MixinStrUnicodeUserStringTest.test_title(self)

    @requires_wide_build
    def test_title_non_bmp(self):
        # Non-BMP cased letters (U+10427 upper / U+1044F lower).
        # Rows are (source, expected source.title()).
        cases = (
            (u'\U0001044F', u'\U00010427'),
            (u'\U0001044F\U0001044F', u'\U00010427\U0001044F'),
            (u'\U0001044F\U0001044F \U0001044F\U0001044F',
             u'\U00010427\U0001044F \U00010427\U0001044F'),
            (u'\U00010427\U0001044F \U00010427\U0001044F',
             u'\U00010427\U0001044F \U00010427\U0001044F'),
            (u'\U0001044F\U00010427 \U0001044F\U00010427',
             u'\U00010427\U0001044F \U00010427\U0001044F'),
            (u'X\U00010427x\U0001044F X\U00010427x\U0001044F',
             u'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F'),
        )
        for source, expected in cases:
            self.assertEqual(source.title(), expected)
    516 
    def test_swapcase(self):
        # Run the shared swapcase() checks unconditionally.  Previously
        # the whole method carried the wide-build skip, which also skipped
        # these common checks on narrow builds.
        string_tests.CommonTest.test_swapcase(self)

    @requires_wide_build
    def test_swapcase_non_bmp(self):
        # Non-BMP cased letters (U+10427 upper / U+1044F lower).
        # Rows are (source, expected source.swapcase()).
        cases = (
            (u'\U0001044F', u'\U00010427'),
            (u'\U00010427', u'\U0001044F'),
            (u'\U0001044F\U0001044F', u'\U00010427\U00010427'),
            (u'\U00010427\U0001044F', u'\U0001044F\U00010427'),
            (u'\U0001044F\U00010427', u'\U00010427\U0001044F'),
            (u'X\U00010427x\U0001044F', u'x\U0001044FX\U00010427'),
        )
        for source, expected in cases:
            self.assertEqual(source.swapcase(), expected)
    530 
    def test_contains(self):
        # Testing Unicode contains method
        # Rows are (member, container, whether membership is expected).
        mixed_cases = (
            ('a', u'abdb', True),
            ('a', u'bdab', True),
            ('a', u'bdaba', True),
            ('a', u'bdba', True),
            ('a', u'bdba', True),
            (u'a', u'bdba', True),
            (u'a', u'bdb', False),
            (u'a', 'bdb', False),
            (u'a', 'bdba', True),
            (u'a', ('a',1,None), True),
            (u'a', (1,None,'a'), True),
            (u'a', (1,None,u'a'), True),
            ('a', ('a',1,None), True),
            ('a', (1,None,'a'), True),
            ('a', (1,None,u'a'), True),
            ('a', ('x',1,u'y'), False),
            ('a', ('x',1,None), False),
            (u'abcd', u'abcxxxx', False),
            (u'ab', u'abcd', True),
            ('ab', u'abc', True),
            (u'ab', 'abc', True),
            (u'ab', (1,None,u'ab'), True),
            (u'', u'abc', True),
            ('', u'abc', True),
        )
        for member, container, present in mixed_cases:
            check = self.assertIn if present else self.assertNotIn
            check(member, container)

        # If the following fails either
        # the contains operator does not propagate UnicodeErrors or
        # someone has changed the default encoding
        with self.assertRaises(UnicodeDecodeError):
            'g\xe2teau'.__contains__(u'\xe2')
        with self.assertRaises(UnicodeDecodeError):
            u'g\xe2teau'.__contains__('\xe2')

        # Each (needle, haystack) pair is tried as unicode-in-str,
        # str-in-unicode and unicode-in-unicode, in that order.
        type_combos = ((unicode, str), (str, unicode), (unicode, unicode))
        substring_cases = (
            ('', '', True),
            ('', 'abc', True),
            ('\0', 'abc', False),
            ('\0', '\0abc', True),
            ('\0', 'abc\0', True),
            ('a', '\0abc', True),
            ('asdf', 'asdf', True),
            ('asdf', 'asd', False),
            ('asdf', '', False),
        )
        for needle, haystack, present in substring_cases:
            check = self.assertIn if present else self.assertNotIn
            for needle_type, haystack_type in type_combos:
                check(needle_type(needle), haystack_type(haystack))

        # __contains__ needs exactly one argument, and it must be a string.
        with self.assertRaises(TypeError):
            u"abc".__contains__()
        with self.assertRaises(TypeError):
            u"abc".__contains__(object())
    594 
    def test_formatting(self):
        """Check %-formatting with unicode format strings and/or arguments.

        Covers unicode format strings with mixed str/unicode/numeric
        arguments, the %c conversion (including issue 7649) and str format
        strings whose result is promoted to unicode by a unicode argument.
        """
        string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)

        # Unicode format strings with mixed argument types.
        self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
        mixed_format = u"%s, %s, %i, %f, %5.2f"
        numeric_cases = (
            ((1, 2, 3), u'abc, abc, 1, 2.000000,  3.00'),
            ((1, -2, 3), u'abc, abc, 1, -2.000000,  3.00'),
            ((-1, -2, 3.5), u'abc, abc, -1, -2.000000,  3.50'),
            ((-1, -2, 3.57), u'abc, abc, -1, -2.000000,  3.57'),
            ((-1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57'),
        )
        for numbers, expected in numeric_cases:
            self.assertEqual(mixed_format % ((u"abc", "abc") + numbers),
                             expected)

        running_on_java = sys.platform.startswith('java')
        if not running_on_java:
            self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")

        # Mapping keys, including a non-ASCII key.
        self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
        self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')

        # %c and malformed format specifications.
        self.assertEqual(u'%c' % 0x1234, u'\u1234')
        self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))
        self.assertRaises(ValueError, u"%.1\u1032f".__mod__, (1.0/3))

        # ASCII range: %c accepts both the byte string and its ordinal.
        for code in range(0x00, 0x80):
            byte_char = chr(code)
            self.assertEqual(u"%c" % byte_char, unicode(byte_char))
            self.assertEqual(u"%c" % code, unicode(byte_char))
            self.assertTrue(byte_char == u"%c" % byte_char)
            self.assertTrue(byte_char == u"%c" % code)

        # Issue 7649: above ASCII only ints and unicode chars are accepted;
        # the implicit decoding of a byte string must fail.
        for code in range(0x80, 0x100):
            expected_char = unichr(code)
            self.assertEqual(expected_char, u"%c" % code)
            self.assertEqual(expected_char, u"%c" % expected_char)
            self.assertRaises(UnicodeDecodeError, u"%c".__mod__, chr(code))
            self.assertRaises(UnicodeDecodeError, u"%s".__mod__, chr(code))

        # Formatting jobs delegated from the str implementation:
        # (format string, arguments, expected result).
        delegated_cases = (
            ('...%(foo)s...', {'foo':u"abc"}, u'...abc...'),
            ('...%(foo)s...', {'foo':"abc"}, '...abc...'),
            ('...%(foo)s...', {u'foo':"abc"}, '...abc...'),
            ('...%(foo)s...', {u'foo':u"abc"}, u'...abc...'),
            ('...%(foo)s...', {u'foo':u"abc",'def':123}, u'...abc...'),
            ('...%(foo)s...', {u'foo':u"abc",u'def':123}, u'...abc...'),
            ('...%s...%s...%s...%s...', (1,2,3,u"abc"),
             u'...1...2...3...abc...'),
            ('...%%...%%s...%s...%s...%s...%s...', (1,2,3,u"abc"),
             u'...%...%s...1...2...3...abc...'),
            ('...%s...', u"abc", u'...abc...'),
            ('%*s', (5,u'abc',), u'  abc'),
            ('%*s', (-5,u'abc',), u'abc  '),
            ('%*.*s', (5,2,u'abc',), u'   ab'),
            ('%*.*s', (5,3,u'abc',), u'  abc'),
            ('%i %*.*s', (10, 5,3,u'abc',), u'10   abc'),
            ('%i%s %*.*s', (10, 3, 5, 3, u'abc',), u'103   abc'),
            ('%c', u'a', u'a'),
        )
        for str_format, arguments, expected in delegated_cases:
            self.assertEqual(str_format % arguments, expected)

        # An old-style class whose __str__ returns unicode.
        class UnicodeReturningStr:
            def __str__(self):
                return u'\u1234'
        self.assertEqual('%s' % UnicodeReturningStr(), u'\u1234')
    649 
    def test_formatting_huge_precision(self):
        """A precision above sys.maxsize must be rejected with ValueError."""
        oversized_precision = u"%.{}f".format(sys.maxsize + 1)
        self.assertRaises(ValueError, oversized_precision.__mod__, 2.34)
    654 
    @test_support.cpython_only
    def test_formatting_huge_precision_c_limits(self):
        """A precision just above the C INT_MAX must raise ValueError."""
        from _testcapi import INT_MAX
        oversized_precision = u"%.{}f".format(INT_MAX + 1)
        self.assertRaises(ValueError, oversized_precision.__mod__, 2.34)
    661 
    def test_formatting_huge_width(self):
        """A field width above sys.maxsize must be rejected with ValueError."""
        oversized_width = u"%{}f".format(sys.maxsize + 1)
        self.assertRaises(ValueError, oversized_width.__mod__, 2.34)
    666 
    def test_startswith_endswith_errors(self):
        """startswith/endswith propagate decode errors and reject bad types.

        The TypeError message is expected to name the accepted argument
        types: unicode, str and tuple.
        """
        for bound_method in (u'foo'.startswith, u'foo'.endswith):
            # A non-ASCII byte string cannot be implicitly decoded.
            self.assertRaises(UnicodeDecodeError, bound_method, '\xff')
            # A list is not an accepted prefix/suffix type.
            with self.assertRaises(TypeError) as caught:
                bound_method(['f'])
            message = str(caught.exception)
            for type_name in ('unicode', 'str', 'tuple'):
                self.assertIn(type_name, message)
    677 
    @test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
    def test_format_float(self):
        """Float formatting must follow the C locale, never use a comma."""
        formatted = u'%.1f' % 1.0
        self.assertEqual(u'1.0', formatted)
    682 
    683     def test_constructor(self):
    684         # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
    685 
    686         self.assertEqual(
    687             unicode(u'unicode remains unicode'),
    688             u'unicode remains unicode'
    689         )
    690 
    691         self.assertEqual(
    692             unicode(UnicodeSubclass('unicode subclass becomes unicode')),
    693             u'unicode subclass becomes unicode'
    694         )
    695 
    696         self.assertEqual(
    697             unicode('strings are converted to unicode'),
    698             u'strings are converted to unicode'
    699         )
    700 
    701         class UnicodeCompat:
    702             def __init__(self, x):
    703                 self.x = x
    704             def __unicode__(self):
    705                 return self.x
    706 
    707         self.assertEqual(
    708             unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
    709             u'__unicode__ compatible objects are recognized')
    710 
    711         class StringCompat:
    712             def __init__(self, x):
    713                 self.x = x
    714             def __str__(self):
    715                 return self.x
    716 
    717         self.assertEqual(
    718             unicode(StringCompat('__str__ compatible objects are recognized')),
    719             u'__str__ compatible objects are recognized'
    720         )
    721 
    722         # unicode(obj) is compatible to str():
    723 
    724         o = StringCompat('unicode(obj) is compatible to str()')
    725         self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
    726         self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
    727 
    728         # %-formatting and .__unicode__()
    729         self.assertEqual(u'%s' %
    730                          UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
    731                          u"u'%s' % obj uses obj.__unicode__()")
    732         self.assertEqual(u'%s' %
    733                          UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"),
    734                          u"u'%s' % obj falls back to obj.__str__()")
    735 
    736         for obj in (123, 123.45, 123L):
    737             self.assertEqual(unicode(obj), unicode(str(obj)))
    738 
    739         # unicode(obj, encoding, error) tests (this maps to
    740         # PyUnicode_FromEncodedObject() at C level)
    741 
    742         if not sys.platform.startswith('java'):
    743             self.assertRaises(
    744                 TypeError,
    745                 unicode,
    746                 u'decoding unicode is not supported',
    747                 'utf-8',
    748                 'strict'
    749             )
    750 
    751         self.assertEqual(
    752             unicode('strings are decoded to unicode', 'utf-8', 'strict'),
    753             u'strings are decoded to unicode'
    754         )
    755 
    756         if not sys.platform.startswith('java'):
    757             with test_support.check_py3k_warnings():
    758                 buf = buffer('character buffers are decoded to unicode')
    759             self.assertEqual(
    760                 unicode(
    761                     buf,
    762                     'utf-8',
    763                     'strict'
    764                 ),
    765                 u'character buffers are decoded to unicode'
    766             )
    767 
    768         self.assertRaises(TypeError, unicode, 42, 42, 42)
    769 
    770     def test_codecs_utf7(self):
    771         utfTests = [
    772             (u'A\u2262\u0391.', 'A+ImIDkQ.'),             # RFC2152 example
    773             (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'),     # RFC2152 example
    774             (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'),        # RFC2152 example
    775             (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
    776             (u'+', '+-'),
    777             (u'+-', '+--'),
    778             (u'+?', '+-?'),
    779             (u'\?', '+AFw?'),
    780             (u'+?', '+-?'),
    781             (ur'\\?', '+AFwAXA?'),
    782             (ur'\\\?', '+AFwAXABc?'),
    783             (ur'++--', '+-+---'),
    784             (u'\U000abcde', '+2m/c3g-'),                  # surrogate pairs
    785             (u'/', '/'),
    786         ]
    787 
    788         for (x, y) in utfTests:
    789             self.assertEqual(x.encode('utf-7'), y)
    790 
    791         # Unpaired surrogates are passed through
    792         self.assertEqual(u'\uD801'.encode('utf-7'), '+2AE-')
    793         self.assertEqual(u'\uD801x'.encode('utf-7'), '+2AE-x')
    794         self.assertEqual(u'\uDC01'.encode('utf-7'), '+3AE-')
    795         self.assertEqual(u'\uDC01x'.encode('utf-7'), '+3AE-x')
    796         self.assertEqual('+2AE-'.decode('utf-7'), u'\uD801')
    797         self.assertEqual('+2AE-x'.decode('utf-7'), u'\uD801x')
    798         self.assertEqual('+3AE-'.decode('utf-7'), u'\uDC01')
    799         self.assertEqual('+3AE-x'.decode('utf-7'), u'\uDC01x')
    800 
    801         self.assertEqual(u'\uD801\U000abcde'.encode('utf-7'), '+2AHab9ze-')
    802         self.assertEqual('+2AHab9ze-'.decode('utf-7'), u'\uD801\U000abcde')
    803 
    804         # Direct encoded characters
    805         set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
    806         # Optional direct characters
    807         set_o = '!"#$%&*;<=>@[]^_`{|}'
    808         for c in set_d:
    809             self.assertEqual(c.encode('utf7'), c.encode('ascii'))
    810             self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
    811             self.assertTrue(c == c.encode('ascii').decode('utf7'))
    812         for c in set_o:
    813             self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
    814             self.assertTrue(c == c.encode('ascii').decode('utf7'))
    815 
    def test_codecs_utf8(self):
        """Check UTF-8 encoding of assorted text and a few decoding cases."""
        # (unicode text, expected UTF-8 bytes)
        encode_cases = (
            (u'', ''),
            (u'\u20ac', '\xe2\x82\xac'),
            (u'\ud800\udc02', '\xf0\x90\x80\x82'),
            (u'\ud84d\udc56', '\xf0\xa3\x91\x96'),
            (u'\ud800', '\xed\xa0\x80'),
            (u'\udc00', '\xed\xb0\x80'),
            (u'\ud800\udc02'*1000, '\xf0\x90\x80\x82'*1000),
        )
        for text, expected_bytes in encode_cases:
            self.assertEqual(text.encode('utf-8'), expected_bytes)

        # A longer passage mixing CJK and ASCII text.
        long_text = (
            u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
            u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
            u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
            u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
            u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
            u' Nunstuck git und'
        )
        long_bytes = (
            '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
            '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
            '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
            '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
            '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
            '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
            '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
            '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
            '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
            '\xe3\x80\x8cWenn ist das Nunstuck git und'
        )
        self.assertEqual(long_text.encode('utf-8'), long_bytes)

        # UTF-8 specific decoding tests
        decode_cases = (
            ('\xf0\xa3\x91\x96', u'\U00023456'),
            ('\xf0\x90\x80\x82', u'\U00010002'),
            ('\xe2\x82\xac', u'\u20ac'),
        )
        for raw_bytes, expected_text in decode_cases:
            self.assertEqual(unicode(raw_bytes, 'utf-8'), expected_text)

        # Other possible utf-8 test cases:
        # * strict decoding testing for all of the
        #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
    854 
    def test_utf8_decode_valid_sequences(self):
        """Decode boundary UTF-8 sequences and round-trip every code point.

        The explicit table covers the first and last sequence of each
        encoded length. The loop then encodes and decodes every code point
        from 0 through sys.maxunicode inclusive.
        """
        sequences = [
            # single byte
            ('\x00', u'\x00'), ('a', u'a'), ('\x7f', u'\x7f'),
            # 2 bytes
            ('\xc2\x80', u'\x80'), ('\xdf\xbf', u'\u07ff'),
            # 3 bytes
            ('\xe0\xa0\x80', u'\u0800'), ('\xed\x9f\xbf', u'\ud7ff'),
            ('\xee\x80\x80', u'\uE000'), ('\xef\xbf\xbf', u'\uffff'),
            # 4 bytes
            ('\xF0\x90\x80\x80', u'\U00010000'),
            ('\xf4\x8f\xbf\xbf', u'\U0010FFFF')
        ]
        for seq, res in sequences:
            self.assertEqual(seq.decode('utf-8'), res)

        # The upper bound is sys.maxunicode + 1 so that the highest code
        # point is included. xrange avoids building two very large
        # intermediate lists on wide builds.
        for code_point in xrange(sys.maxunicode + 1):
            ch = unichr(code_point)
            self.assertEqual(ch, ch.encode('utf-8').decode('utf-8'))
    873 
    def test_utf8_decode_invalid_sequences(self):
        """Strict UTF-8 decoding must reject ill-formed byte sequences.

        Checks invalid start bytes on their own, invalid start bytes
        followed by continuation bytes, and valid start bytes followed by
        out-of-range second bytes. Also pins the Python 2 behaviour that
        encoded surrogates are accepted.
        """
        # continuation bytes in a sequence of 2, 3, or 4 bytes
        continuation_bytes = map(chr, range(0x80, 0xC0))
        # start bytes of a 2-byte sequence equivalent to code points <= 0x7F
        invalid_2B_seq_start_bytes = map(chr, range(0xC0, 0xC2))
        # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
        invalid_4B_seq_start_bytes = map(chr, range(0xF5, 0xF8))
        # The remaining bytes start at 0xF8 so that 0xF7, already part of
        # invalid_4B_seq_start_bytes, is not checked twice.
        invalid_start_bytes = (
            continuation_bytes + invalid_2B_seq_start_bytes +
            invalid_4B_seq_start_bytes + map(chr, range(0xF8, 0x100))
        )

        for byte in invalid_start_bytes:
            self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')

        for sb in invalid_2B_seq_start_bytes:
            for cb in continuation_bytes:
                self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')

        for sb in invalid_4B_seq_start_bytes:
            for cb1 in continuation_bytes[:3]:
                for cb3 in continuation_bytes[:3]:
                    self.assertRaises(UnicodeDecodeError,
                                      (sb+cb1+'\x80'+cb3).decode, 'utf-8')

        # 0xE0 followed by 0x80..0x9F would be an overlong 3-byte form.
        for cb in map(chr, range(0x80, 0xA0)):
            self.assertRaises(UnicodeDecodeError,
                              ('\xE0'+cb+'\x80').decode, 'utf-8')
            self.assertRaises(UnicodeDecodeError,
                              ('\xE0'+cb+'\xBF').decode, 'utf-8')

        # XXX: surrogates shouldn't be valid UTF-8!
        # see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
        # (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
        # Sequences '\xED' + (0xA0..0xBF) + continuation byte are therefore
        # not checked for UnicodeDecodeError here; since they are valid on
        # Python 2, test that they round-trip instead:
        for cb, surrogate in zip(map(chr, range(0xA0, 0xC0)),
                                 map(unichr, range(0xd800, 0xe000, 64))):
            encoded = '\xED'+cb+'\x80'
            self.assertEqual(encoded.decode('utf-8'), surrogate)
            self.assertEqual(surrogate.encode('utf-8'), encoded)

        # 0xF0 followed by 0x80..0x8F would be an overlong 4-byte form.
        for cb in map(chr, range(0x80, 0x90)):
            self.assertRaises(UnicodeDecodeError,
                              ('\xF0'+cb+'\x80\x80').decode, 'utf-8')
            self.assertRaises(UnicodeDecodeError,
                              ('\xF0'+cb+'\xBF\xBF').decode, 'utf-8')
        # 0xF4 followed by 0x90..0xBF would encode a value above 0x10FFFF.
        for cb in map(chr, range(0x90, 0xC0)):
            self.assertRaises(UnicodeDecodeError,
                              ('\xF4'+cb+'\x80\x80').decode, 'utf-8')
            self.assertRaises(UnicodeDecodeError,
                              ('\xF4'+cb+'\xBF\xBF').decode, 'utf-8')
    929 
    def test_issue8271(self):
        """Check how many U+FFFD are produced for invalid UTF-8 input.

        Each entry pairs an invalid byte sequence with its expected result
        under the 'replace' error handler. The same entry is also checked
        with 'strict' (must raise), with a trailing valid byte appended,
        and with 'ignore' (expected result minus the U+FFFD characters).
        """
        # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
        # only the start byte and the continuation byte(s) are now considered
        # invalid, instead of the number of bytes specified by the start byte.
        # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
        # table 3-8, Row 2) for more information about the algorithm used.
        FFFD = u'\ufffd'
        sequences = [
            # invalid start bytes
            ('\x80', FFFD), # continuation byte
            ('\x80\x80', FFFD*2), # 2 continuation bytes
            ('\xc0', FFFD),
            ('\xc0\xc0', FFFD*2),
            ('\xc1', FFFD),
            ('\xc1\xc0', FFFD*2),
            ('\xc0\xc1', FFFD*2),
            # with start byte of a 2-byte sequence
            ('\xc2', FFFD), # only the start byte
            ('\xc2\xc2', FFFD*2), # 2 start bytes
            ('\xc2\xc2\xc2', FFFD*3), # 3 start bytes
            ('\xc2\x41', FFFD+'A'), # invalid continuation byte
            # with start byte of a 3-byte sequence
            ('\xe1', FFFD), # only the start byte
            ('\xe1\xe1', FFFD*2), # 2 start bytes
            ('\xe1\xe1\xe1', FFFD*3), # 3 start bytes
            ('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
            ('\xe1\x80', FFFD), # only 1 continuation byte
            ('\xe1\x41', FFFD+'A'), # invalid continuation byte
            ('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
            ('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
            ('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
            ('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
            ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
            # with start byte of a 4-byte sequence
            ('\xf1', FFFD), # only the start byte
            ('\xf1\xf1', FFFD*2), # 2 start bytes
            ('\xf1\xf1\xf1', FFFD*3), # 3 start bytes
            ('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
            ('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
            ('\xf1\x80', FFFD), # only 1 continuation byte
            ('\xf1\x80\x80', FFFD), # only 2 continuation bytes
            ('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
            ('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 2 invalid
            ('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
            ('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cb and 1 valid
            ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 valid
            ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # invalid, valid, invalid cb
            ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 2 invalid cb and 1 valid
            ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
            ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
            ('\xf1\xf1\x80\x41', FFFD*2+'A'),
            ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
            # with invalid start byte of a 4-byte sequence (rfc2279)
            ('\xf5', FFFD), # only the start byte
            ('\xf5\xf5', FFFD*2), # 2 start bytes
            ('\xf5\x80', FFFD*2), # only 1 continuation byte
            ('\xf5\x80\x80', FFFD*3), # only 2 continuation bytes
            ('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
            ('\xf5\x80\x41', FFFD*2+'A'), #  1 valid cb and 1 invalid
            ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
            ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
            # with invalid start byte of a 5-byte sequence (rfc2279)
            ('\xf8', FFFD), # only the start byte
            ('\xf8\xf8', FFFD*2), # 2 start bytes
            ('\xf8\x80', FFFD*2), # only one continuation byte
            ('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
            ('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
            # with invalid start byte of a 6-byte sequence (rfc2279)
            ('\xfc', FFFD), # only the start byte
            ('\xfc\xfc', FFFD*2), # 2 start bytes
            ('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
            ('\xfc\x80\x80\x80\x80\x80', FFFD*6), # start byte + 5 continuation bytes
            # invalid start byte
            ('\xfe', FFFD),
            ('\xfe\x80\x80', FFFD*3),
            # other sequences
            ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
            ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
            ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
            ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
             u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
        ]
        for seq, res in sequences:
            self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
            self.assertEqual(seq.decode('utf-8', 'replace'), res)
            # A trailing valid byte must not change how the error is replaced.
            self.assertEqual((seq+'b').decode('utf-8', 'replace'), res+'b')
            self.assertEqual(seq.decode('utf-8', 'ignore'),
                             res.replace(u'\uFFFD', ''))
   1018 
   1019     def test_codecs_idna(self):
   1020         # Test whether trailing dot is preserved
   1021         self.assertEqual(u"www.python.org.".encode("idna"), "www.python.org.")
   1022 
   1023     def test_codecs_errors(self):
   1024         # Error handling (encoding)
   1025         self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii')
   1026         self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii','strict')
   1027         self.assertEqual(u'Andr\202 x'.encode('ascii','ignore'), "Andr x")
   1028         self.assertEqual(u'Andr\202 x'.encode('ascii','replace'), "Andr? x")
   1029         self.assertEqual(u'Andr\202 x'.encode('ascii', 'replace'),
   1030                          u'Andr\202 x'.encode('ascii', errors='replace'))
   1031         self.assertEqual(u'Andr\202 x'.encode('ascii', 'ignore'),
   1032                          u'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
   1033 
   1034         # Error handling (decoding)
   1035         self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii')
   1036         self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
   1037         self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x")
   1038         self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x')
   1039         self.assertEqual(unicode('\202 x', 'ascii', 'replace'), u'\uFFFD x')
   1040         with test_support.check_py3k_warnings():
   1041             self.assertEqual(u'abcde'.decode('ascii', 'ignore'),
   1042                              u'abcde'.decode('ascii', errors='ignore'))
   1043         with test_support.check_py3k_warnings():
   1044             self.assertEqual(u'abcde'.decode('ascii', 'replace'),
   1045                              u'abcde'.decode(encoding='ascii', errors='replace'))
   1046 
   1047         # Error handling (unknown character names)
   1048         self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")
   1049 
   1050         # Error handling (truncated escape sequence)
   1051         self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")
   1052 
   1053         self.assertRaises(TypeError, "hello".decode, "test.unicode1")
   1054         self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
   1055         self.assertRaises(TypeError, u"hello".encode, "test.unicode1")
   1056         self.assertRaises(TypeError, u"hello".encode, "test.unicode2")
   1057         # executes PyUnicode_Encode()
   1058         import imp
   1059         self.assertRaises(
   1060             ImportError,
   1061             imp.find_module,
   1062             "non-existing module",
   1063             [u"non-existing dir"]
   1064         )
   1065 
   1066         # Error handling (wrong arguments)
   1067         self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)
   1068 
   1069         # Error handling (PyUnicode_EncodeDecimal())
   1070         self.assertRaises(UnicodeError, int, u"\u0200")
   1071 
   1072     def test_codecs(self):
   1073         # Encoding
   1074         self.assertEqual(u'hello'.encode('ascii'), 'hello')
   1075         self.assertEqual(u'hello'.encode('utf-7'), 'hello')
   1076         self.assertEqual(u'hello'.encode('utf-8'), 'hello')
   1077         self.assertEqual(u'hello'.encode('utf8'), 'hello')
   1078         self.assertEqual(u'hello'.encode('utf-16-le'), 'h\000e\000l\000l\000o\000')
   1079         self.assertEqual(u'hello'.encode('utf-16-be'), '\000h\000e\000l\000l\000o')
   1080         self.assertEqual(u'hello'.encode('latin-1'), 'hello')
   1081 
   1082         # Roundtrip safety for BMP (just the first 1024 chars)
   1083         for c in xrange(1024):
   1084             u = unichr(c)
   1085             for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
   1086                              'utf-16-be', 'raw_unicode_escape',
   1087                              'unicode_escape', 'unicode_internal'):
   1088                 self.assertEqual(unicode(u.encode(encoding),encoding), u)
   1089 
   1090         # Roundtrip safety for BMP (just the first 256 chars)
   1091         for c in xrange(256):
   1092             u = unichr(c)
   1093             for encoding in ('latin-1',):
   1094                 self.assertEqual(unicode(u.encode(encoding),encoding), u)
   1095 
   1096         # Roundtrip safety for BMP (just the first 128 chars)
   1097         for c in xrange(128):
   1098             u = unichr(c)
   1099             for encoding in ('ascii',):
   1100                 self.assertEqual(unicode(u.encode(encoding),encoding), u)
   1101 
   1102         # Roundtrip safety for non-BMP (just a few chars)
   1103         u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
   1104         for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
   1105                          #'raw_unicode_escape',
   1106                          'unicode_escape', 'unicode_internal'):
   1107             self.assertEqual(unicode(u.encode(encoding),encoding), u)
   1108 
   1109         # UTF-8 must be roundtrip safe for all UCS-2 code points
   1110         # This excludes surrogates: in the full range, there would be
   1111         # a surrogate pair (\udbff\udc00), which gets converted back
   1112         # to a non-BMP character (\U0010fc00)
   1113         u = u''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000)))
   1114         for encoding in ('utf-8',):
   1115             self.assertEqual(unicode(u.encode(encoding),encoding), u)
   1116 
   1117     def test_codecs_charmap(self):
   1118         # 0-127
   1119         s = ''.join(map(chr, xrange(128)))
   1120         for encoding in (
   1121             'cp037', 'cp1026',
   1122             'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
   1123             'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
   1124             'cp863', 'cp865', 'cp866',
   1125             'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
   1126             'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
   1127             'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
   1128             'mac_cyrillic', 'mac_latin2',
   1129 
   1130             'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
   1131             'cp1256', 'cp1257', 'cp1258',
   1132             'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
   1133 
   1134             'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
   1135             'cp1006', 'iso8859_8',
   1136 
   1137             ### These have undefined mappings:
   1138             #'cp424',
   1139 
   1140             ### These fail the round-trip:
   1141             #'cp875'
   1142 
   1143             ):
   1144             self.assertEqual(unicode(s, encoding).encode(encoding), s)
   1145 
   1146         # 128-255
   1147         s = ''.join(map(chr, xrange(128, 256)))
   1148         for encoding in (
   1149             'cp037', 'cp1026',
   1150             'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
   1151             'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
   1152             'cp863', 'cp865', 'cp866',
   1153             'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
   1154             'iso8859_2', 'iso8859_4', 'iso8859_5',
   1155             'iso8859_9', 'koi8_r', 'latin_1',
   1156             'mac_cyrillic', 'mac_latin2',
   1157 
   1158             ### These have undefined mappings:
   1159             #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
   1160             #'cp1256', 'cp1257', 'cp1258',
   1161             #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
   1162             #'iso8859_3', 'iso8859_6', 'iso8859_7',
   1163             #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
   1164 
   1165             ### These fail the round-trip:
   1166             #'cp1006', 'cp875', 'iso8859_8',
   1167 
   1168             ):
   1169             self.assertEqual(unicode(s, encoding).encode(encoding), s)
   1170 
   1171     def test_concatenation(self):
                # Implicit concatenation of adjacent string literals: every mix of
                # unicode (u"...") and byte-string ("...") literals below is expected
                # to compare equal to a single unicode literal.
   1172         self.assertEqual((u"abc" u"def"), u"abcdef")
   1173         self.assertEqual(("abc" u"def"), u"abcdef")
   1174         self.assertEqual((u"abc" "def"), u"abcdef")
   1175         self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi")
   1176         self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi")
   1177 
   1178     def test_printing(self):
                # Smoke test for the print statement with unicode arguments.  There
                # are no assertions: the test passes as long as none of the print
                # statements below raises.
   1179         class BitBucket:
                    # Minimal file-like sink: write() accepts the text and discards it.
   1180             def write(self, text):
   1181                 pass
   1182 
   1183         out = BitBucket()
                # Unicode alone, mixed with byte strings in both orders, with an
                # embedded trailing newline, and with/without the trailing comma
                # form of the print statement.
   1184         print >>out, u'abc'
   1185         print >>out, u'abc', u'def'
   1186         print >>out, u'abc', 'def'
   1187         print >>out, 'abc', u'def'
   1188         print >>out, u'abc\n'
   1189         print >>out, u'abc\n',
   1190         print >>out, u'abc\n',
   1191         print >>out, u'def\n'
   1192         print >>out, u'def\n'
   1193 
   1194     def test_ucs4(self):
                # Round-trips code points above U+FFFF through the
                # "raw-unicode-escape" codec and checks the error reported for an
                # out-of-range escape.

                # unicode -> bytes -> unicode must give back the original object.
   1195         x = u'\U00100000'
   1196         y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
   1197         self.assertEqual(x, y)
   1198 
                # bytes -> unicode -> bytes: the raw byte strings hold the literal
                # escape text, which must be reproduced unchanged.
   1199         y = r'\U00100000'
   1200         x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
   1201         self.assertEqual(x, y)
   1202         y = r'\U00010000'
   1203         x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
   1204         self.assertEqual(x, y)
   1205 
                # '\U11111111' is a byte-string literal, i.e. the ten characters
                # backslash, 'U' and eight digits.  Decoding it must fail, and the
                # exception is expected to span the whole escape (start 0, end 10).
   1206         try:
   1207             '\U11111111'.decode("raw-unicode-escape")
   1208         except UnicodeDecodeError as e:
   1209             self.assertEqual(e.start, 0)
   1210             self.assertEqual(e.end, 10)
   1211         else:
   1212             self.fail("Should have raised UnicodeDecodeError")
   1213 
   1214     def test_conversion(self):
   1215         # Make sure __unicode__() works properly
                # The helper classes below cover the combinations of class kind
                # (old-style, new-style, str subclass, unicode subclass) and of the
                # type returned by __str__/__unicode__.

                # Old-style class that only defines __str__.
   1216         class Foo0:
   1217             def __str__(self):
   1218                 return "foo"
   1219 
                # Old-style class whose __unicode__ returns a unicode object.
   1220         class Foo1:
   1221             def __unicode__(self):
   1222                 return u"foo"
   1223 
                # New-style class whose __unicode__ returns a unicode object.
   1224         class Foo2(object):
   1225             def __unicode__(self):
   1226                 return u"foo"
   1227 
                # New-style class whose __unicode__ returns a byte string.
   1228         class Foo3(object):
   1229             def __unicode__(self):
   1230                 return "foo"
   1231 
                # str subclass whose __unicode__ returns a byte string.
   1232         class Foo4(str):
   1233             def __unicode__(self):
   1234                 return "foo"
   1235 
                # unicode subclass whose __unicode__ returns a byte string.
   1236         class Foo5(unicode):
   1237             def __unicode__(self):
   1238                 return "foo"
   1239 
                # str subclass defining both hooks with distinguishable results.
   1240         class Foo6(str):
   1241             def __str__(self):
   1242                 return "foos"
   1243 
   1244             def __unicode__(self):
   1245                 return u"foou"
   1246 
                # unicode subclass defining both hooks with distinguishable results.
   1247         class Foo7(unicode):
   1248             def __str__(self):
   1249                 return "foos"
   1250             def __unicode__(self):
   1251                 return u"foou"
   1252 
                # unicode subclass whose constructor doubles its content and whose
                # __unicode__ returns the instance itself.
   1253         class Foo8(unicode):
   1254             def __new__(cls, content=""):
   1255                 return unicode.__new__(cls, 2*content)
   1256             def __unicode__(self):
   1257                 return self
   1258 
                # unicode subclass where both __str__ and __unicode__ return byte
                # strings, with different text so the hook used can be identified.
   1259         class Foo9(unicode):
   1260             def __str__(self):
   1261                 return "string"
   1262             def __unicode__(self):
   1263                 return "not unicode"
   1264 
                # In every case the value produced by __unicode__ (or by __str__ for
                # Foo0, which has no __unicode__) is the expected result; the
                # constructor argument "bar" of the str/unicode subclasses is not.
   1265         self.assertEqual(unicode(Foo0()), u"foo")
   1266         self.assertEqual(unicode(Foo1()), u"foo")
   1267         self.assertEqual(unicode(Foo2()), u"foo")
   1268         self.assertEqual(unicode(Foo3()), u"foo")
   1269         self.assertEqual(unicode(Foo4("bar")), u"foo")
   1270         self.assertEqual(unicode(Foo5("bar")), u"foo")
   1271         self.assertEqual(unicode(Foo6("bar")), u"foou")
   1272         self.assertEqual(unicode(Foo7("bar")), u"foou")
                # Foo8: unicode() is expected to hand back the Foo8 instance returned
                # by __unicode__, while constructing UnicodeSubclass from it is
                # expected to yield an exact UnicodeSubclass.
   1273         self.assertEqual(unicode(Foo8("foo")), u"foofoo")
   1274         self.assertIs(type(unicode(Foo8("foo"))), Foo8)
   1275         self.assertEqual(UnicodeSubclass(Foo8("foo")), u"foofoo")
   1276         self.assertIs(type(UnicodeSubclass(Foo8("foo"))), UnicodeSubclass)
                # Foo9: str() uses __str__, unicode() uses __unicode__.
   1277         self.assertEqual(str(Foo9("foo")), "string")
   1278         self.assertEqual(unicode(Foo9("foo")), u"not unicode")
   1279 
   1280     def test_unicode_repr(self):
                # repr() of old-style instances whose __repr__ returns a byte string
                # (s1) or a unicode object (s2).  Both results are compared against
                # the byte string '\\n', i.e. a backslash followed by 'n'.
   1281         class s1:
   1282             def __repr__(self):
   1283                 return '\\n'
   1284 
   1285         class s2:
   1286             def __repr__(self):
   1287                 return u'\\n'
   1288 
   1289         self.assertEqual(repr(s1()), '\\n')
   1290         self.assertEqual(repr(s2()), '\\n')
   1291 
   1292     # This test only affects 32-bit platforms because expandtabs can only take
   1293     # an int as the max value, not a 64-bit C long.  If expandtabs is changed
   1294     # to take a 64-bit long, this test should apply to all platforms.
            # Skipped when sys.maxint exceeds 1 << 32 or when a pointer
            # (struct format 'P') is not 4 bytes wide.
   1295     @unittest.skipIf(sys.maxint > (1 << 32) or struct.calcsize('P') != 4,
   1296                      'only applies to 32-bit platforms')
   1297     def test_expandtabs_overflows_gracefully(self):
                # Expanding two tabs with a tab size of sys.maxint must raise
                # OverflowError.
   1298         self.assertRaises(OverflowError, u't\tt\t'.expandtabs, sys.maxint)
   1299 
   1300     def test__format__(self):
                # Calls unicode.__format__() directly with string format specs.
                # The local helper asserts that `value` formatted with `format`
                # equals `expected`, both as given and with an 's' type code
                # appended to the spec.
   1301         def test(value, format, expected):
   1302             # test both with and without the trailing 's'
   1303             self.assertEqual(value.__format__(format), expected)
   1304             self.assertEqual(value.__format__(format + u's'), expected)
   1305 
                # Empty spec, then precision (truncation) alone and combined with
                # a minimum width.
   1306         test(u'', u'', u'')
   1307         test(u'abc', u'', u'abc')
   1308         test(u'abc', u'.3', u'abc')
   1309         test(u'ab', u'.3', u'ab')
   1310         test(u'abcdef', u'.3', u'abc')
   1311         test(u'abcdef', u'.0', u'')
   1312         test(u'abc', u'3.3', u'abc')
   1313         test(u'abc', u'2.3', u'abc')
   1314         test(u'abc', u'2.2', u'ab')
   1315         test(u'abc', u'3.2', u'ab ')
                # Fill character and alignment; a width not larger than the value
                # leaves the value unchanged.
   1316         test(u'result', u'x<0', u'result')
   1317         test(u'result', u'x<5', u'result')
   1318         test(u'result', u'x<6', u'result')
   1319         test(u'result', u'x<7', u'resultx')
   1320         test(u'result', u'x<8', u'resultxx')
   1321         test(u'result', u' <7', u'result ')
   1322         test(u'result', u'<7', u'result ')
   1323         test(u'result', u'>7', u' result')
   1324         test(u'result', u'>8', u'  result')
   1325         test(u'result', u'^8', u' result ')
   1326         test(u'result', u'^9', u' result  ')
   1327         test(u'result', u'^10', u'  result  ')
                # Large widths: the result is padded with spaces up to the width.
   1328         test(u'a', u'10000', u'a' + u' ' * 9999)
   1329         test(u'', u'10000', u' ' * 10000)
   1330         test(u'', u'10000000', u' ' * 10000000)
   1331 
   1332         # test mixing unicode and str
                # (the format spec is a byte string, the expected result is unicode)
   1333         self.assertEqual(u'abc'.__format__('s'), u'abc')
   1334         self.assertEqual(u'abc'.__format__('->10s'), u'-------abc')
   1335 
   1336     def test_format(self):
                # Exercises the format() method of unicode objects: brace escaping,
                # positional / keyword / attribute / index field lookup, string
                # format specs, !r and !s conversions, user-defined __format__,
                # computed (nested) specs, malformed templates, and mixing byte
                # strings with unicode.

                # Templates without replacement fields; doubled braces are escapes.
   1337         self.assertEqual(u''.format(), u'')
   1338         self.assertEqual(u'a'.format(), u'a')
   1339         self.assertEqual(u'ab'.format(), u'ab')
   1340         self.assertEqual(u'a{{'.format(), u'a{')
   1341         self.assertEqual(u'a}}'.format(), u'a}')
   1342         self.assertEqual(u'{{b'.format(), u'{b')
   1343         self.assertEqual(u'}}b'.format(), u'}b')
   1344         self.assertEqual(u'a{{b'.format(), u'a{b')
   1345 
   1346         # examples from the PEP:
                # datetime is imported locally; it is used by class I further down.
   1347         import datetime
   1348         self.assertEqual(u"My name is {0}".format(u'Fred'), u"My name is Fred")
   1349         self.assertEqual(u"My name is {0[name]}".format(dict(name=u'Fred')),
   1350                          u"My name is Fred")
   1351         self.assertEqual(u"My name is {0} :-{{}}".format(u'Fred'),
   1352                          u"My name is Fred :-{}")
   1353 
   1354         # datetime.__format__ doesn't work with unicode
   1355         #d = datetime.date(2007, 8, 18)
   1356         #self.assertEqual("The year is {0.year}".format(d),
   1357         #                 "The year is 2007")
   1358 
   1359         # classes we'll use for testing
                # C stores its argument as _x and echoes the format spec back from
                # __format__.
   1360         class C:
   1361             def __init__(self, x=100):
   1362                 self._x = x
   1363             def __format__(self, spec):
   1364                 return spec
   1365 
                # D formats as str(self.x) regardless of the format spec.
   1366         class D:
   1367             def __init__(self, x):
   1368                 self.x = x
   1369             def __format__(self, spec):
   1370                 return str(self.x)
   1371 
   1372         # class with __str__, but no __format__
   1373         class E:
   1374             def __init__(self, x):
   1375                 self.x = x
   1376             def __str__(self):
   1377                 return u'E(' + self.x + u')'
   1378 
   1379         # class with __repr__, but no __format__ or __str__
   1380         class F:
   1381             def __init__(self, x):
   1382                 self.x = x
   1383             def __repr__(self):
   1384                 return u'F(' + self.x + u')'
   1385 
   1386         # class with __format__ that forwards to string, for some format_spec's
                # (only the spec 'd' is handled here; anything else is delegated to
                # object.__format__)
   1387         class G:
   1388             def __init__(self, x):
   1389                 self.x = x
   1390             def __str__(self):
   1391                 return u"string is " + self.x
   1392             def __format__(self, format_spec):
   1393                 if format_spec == 'd':
   1394                     return u'G(' + self.x + u')'
   1395                 return object.__format__(self, format_spec)
   1396 
   1397         # class that returns a bad type from __format__
                # NOTE(review): H is not referenced by any assertion in this method.
   1398         class H:
   1399             def __format__(self, format_spec):
   1400                 return 1.0
   1401 
                # I passes the format spec to strftime() as the date format.
   1402         class I(datetime.date):
   1403             def __format__(self, format_spec):
   1404                 return self.strftime(format_spec)
   1405 
                # J is an int subclass that formats twice its own value.
   1406         class J(int):
   1407             def __format__(self, format_spec):
   1408                 return int.__format__(self * 2, format_spec)
   1409 
   1410 
                # Positional fields and literal text around them; brace escapes.
   1411         self.assertEqual(u''.format(), u'')
   1412         self.assertEqual(u'abc'.format(), u'abc')
   1413         self.assertEqual(u'{0}'.format(u'abc'), u'abc')
   1414         self.assertEqual(u'{0:}'.format(u'abc'), u'abc')
   1415         self.assertEqual(u'X{0}'.format(u'abc'), u'Xabc')
   1416         self.assertEqual(u'{0}X'.format(u'abc'), u'abcX')
   1417         self.assertEqual(u'X{0}Y'.format(u'abc'), u'XabcY')
   1418         self.assertEqual(u'{1}'.format(1, u'abc'), u'abc')
   1419         self.assertEqual(u'X{1}'.format(1, u'abc'), u'Xabc')
   1420         self.assertEqual(u'{1}X'.format(1, u'abc'), u'abcX')
   1421         self.assertEqual(u'X{1}Y'.format(1, u'abc'), u'XabcY')
   1422         self.assertEqual(u'{0}'.format(-15), u'-15')
   1423         self.assertEqual(u'{0}{1}'.format(-15, u'abc'), u'-15abc')
   1424         self.assertEqual(u'{0}X{1}'.format(-15, u'abc'), u'-15Xabc')
   1425         self.assertEqual(u'{{'.format(), u'{')
   1426         self.assertEqual(u'}}'.format(), u'}')
   1427         self.assertEqual(u'{{}}'.format(), u'{}')
   1428         self.assertEqual(u'{{x}}'.format(), u'{x}')
   1429         self.assertEqual(u'{{{0}}}'.format(123), u'{123}')
   1430         self.assertEqual(u'{{{{0}}}}'.format(), u'{{0}}')
   1431         self.assertEqual(u'}}{{'.format(), u'}{')
   1432         self.assertEqual(u'}}x{{'.format(), u'}x{')
   1433 
   1434         # weird field names
   1435         self.assertEqual(u"{0[foo-bar]}".format({u'foo-bar':u'baz'}), u'baz')
   1436         self.assertEqual(u"{0[foo bar]}".format({u'foo bar':u'baz'}), u'baz')
   1437         self.assertEqual(u"{0[ ]}".format({u' ':3}), u'3')
   1438 
                # Attribute and index lookups, including chained ones.
   1439         self.assertEqual(u'{foo._x}'.format(foo=C(20)), u'20')
   1440         self.assertEqual(u'{1}{0}'.format(D(10), D(20)), u'2010')
   1441         self.assertEqual(u'{0._x.x}'.format(C(D(u'abc'))), u'abc')
   1442         self.assertEqual(u'{0[0]}'.format([u'abc', u'def']), u'abc')
   1443         self.assertEqual(u'{0[1]}'.format([u'abc', u'def']), u'def')
   1444         self.assertEqual(u'{0[1][0]}'.format([u'abc', [u'def']]), u'def')
   1445         self.assertEqual(u'{0[1][0].x}'.format(['abc', [D(u'def')]]), u'def')
   1446 
   1447         # strings
   1448         self.assertEqual(u'{0:.3s}'.format(u'abc'), u'abc')
   1449         self.assertEqual(u'{0:.3s}'.format(u'ab'), u'ab')
   1450         self.assertEqual(u'{0:.3s}'.format(u'abcdef'), u'abc')
   1451         self.assertEqual(u'{0:.0s}'.format(u'abcdef'), u'')
   1452         self.assertEqual(u'{0:3.3s}'.format(u'abc'), u'abc')
   1453         self.assertEqual(u'{0:2.3s}'.format(u'abc'), u'abc')
   1454         self.assertEqual(u'{0:2.2s}'.format(u'abc'), u'ab')
   1455         self.assertEqual(u'{0:3.2s}'.format(u'abc'), u'ab ')
   1456         self.assertEqual(u'{0:x<0s}'.format(u'result'), u'result')
   1457         self.assertEqual(u'{0:x<5s}'.format(u'result'), u'result')
   1458         self.assertEqual(u'{0:x<6s}'.format(u'result'), u'result')
   1459         self.assertEqual(u'{0:x<7s}'.format(u'result'), u'resultx')
   1460         self.assertEqual(u'{0:x<8s}'.format(u'result'), u'resultxx')
   1461         self.assertEqual(u'{0: <7s}'.format(u'result'), u'result ')
   1462         self.assertEqual(u'{0:<7s}'.format(u'result'), u'result ')
   1463         self.assertEqual(u'{0:>7s}'.format(u'result'), u' result')
   1464         self.assertEqual(u'{0:>8s}'.format(u'result'), u'  result')
   1465         self.assertEqual(u'{0:^8s}'.format(u'result'), u' result ')
   1466         self.assertEqual(u'{0:^9s}'.format(u'result'), u' result  ')
   1467         self.assertEqual(u'{0:^10s}'.format(u'result'), u'  result  ')
   1468         self.assertEqual(u'{0:10000}'.format(u'a'), u'a' + u' ' * 9999)
   1469         self.assertEqual(u'{0:10000}'.format(u''), u' ' * 10000)
   1470         self.assertEqual(u'{0:10000000}'.format(u''), u' ' * 10000000)
   1471 
   1472         # issue 12546: use \x00 as a fill character
                # NOTE(review): the templates and expected values in this group are
                # byte strings, not unicode objects.
   1473         self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
   1474         self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
   1475         self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
   1476         self.assertEqual('{0:^6s}'.format('foo'), ' foo  ')
   1477 
   1478         self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
   1479         self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
   1480         self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
   1481         self.assertEqual('{0:<6}'.format(3), '3     ')
   1482 
   1483         self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
   1484         self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
   1485         self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
   1486         self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')
   1487 
   1488         self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
   1489         self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
   1490         self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
   1491         self.assertEqual('{0:^12}'.format(3+2.0j), '   (3+2j)   ')
   1492 
   1493         # format specifiers for user defined type
   1494         self.assertEqual(u'{0:abc}'.format(C()), u'abc')
   1495 
   1496         # !r and !s coercions
   1497         self.assertEqual(u'{0!s}'.format(u'Hello'), u'Hello')
   1498         self.assertEqual(u'{0!s:}'.format(u'Hello'), u'Hello')
   1499         self.assertEqual(u'{0!s:15}'.format(u'Hello'), u'Hello          ')
   1500         self.assertEqual(u'{0!s:15s}'.format(u'Hello'), u'Hello          ')
   1501         self.assertEqual(u'{0!r}'.format(u'Hello'), u"u'Hello'")
   1502         self.assertEqual(u'{0!r:}'.format(u'Hello'), u"u'Hello'")
   1503         self.assertEqual(u'{0!r}'.format(F(u'Hello')), u'F(Hello)')
   1504 
   1505         # test fallback to object.__format__
   1506         self.assertEqual(u'{0}'.format({}), u'{}')
   1507         self.assertEqual(u'{0}'.format([]), u'[]')
   1508         self.assertEqual(u'{0}'.format([1]), u'[1]')
   1509         self.assertEqual(u'{0}'.format(E(u'data')), u'E(data)')
   1510         self.assertEqual(u'{0:d}'.format(G(u'data')), u'G(data)')
   1511         self.assertEqual(u'{0!s}'.format(G(u'data')), u'string is data')
   1512 
                # The three calls below pass a non-empty spec to objects that rely
                # on object.__format__; the block checks for the matching
                # PendingDeprecationWarning.
   1513         msg = 'object.__format__ with a non-empty format string is deprecated'
   1514         with test_support.check_warnings((msg, PendingDeprecationWarning)):
   1515             self.assertEqual(u'{0:^10}'.format(E(u'data')), u' E(data)  ')
   1516             self.assertEqual(u'{0:^10s}'.format(E(u'data')), u' E(data)  ')
   1517             self.assertEqual(u'{0:>15s}'.format(G(u'data')), u' string is data')
   1518 
                # The whole text after the colon reaches I.__format__ as the
                # strftime format.
   1519         self.assertEqual(u"{0:date: %Y-%m-%d}".format(I(year=2007,
   1520                                                         month=8,
   1521                                                         day=27)),
   1522                          u"date: 2007-08-27")
   1523 
   1524         # test deriving from a builtin type and overriding __format__
   1525         self.assertEqual(u"{0}".format(J(10)), u"20")
   1526 
   1527 
   1528         # string format specifiers
   1529         self.assertEqual(u'{0:}'.format('a'), u'a')
   1530 
   1531         # computed format specifiers
   1532         self.assertEqual(u"{0:.{1}}".format(u'hello world', 5), u'hello')
   1533         self.assertEqual(u"{0:.{1}s}".format(u'hello world', 5), u'hello')
   1534         self.assertEqual(u"{0:.{precision}s}".format('hello world', precision=5), u'hello')
   1535         self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width=10, precision=5), u'hello     ')
   1536         self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), u'hello     ')
   1537 
   1538         # test various errors
   1539         self.assertRaises(ValueError, u'{'.format)
   1540         self.assertRaises(ValueError, u'}'.format)
   1541         self.assertRaises(ValueError, u'a{'.format)
   1542         self.assertRaises(ValueError, u'a}'.format)
   1543         self.assertRaises(ValueError, u'{a'.format)
   1544         self.assertRaises(ValueError, u'}a'.format)
   1545         self.assertRaises(IndexError, u'{0}'.format)
   1546         self.assertRaises(IndexError, u'{1}'.format, u'abc')
   1547         self.assertRaises(KeyError,   u'{x}'.format)
   1548         self.assertRaises(ValueError, u"}{".format)
   1549         self.assertRaises(ValueError, u"{".format)
   1550         self.assertRaises(ValueError, u"}".format)
   1551         self.assertRaises(ValueError, u"abc{0:{}".format)
   1552         self.assertRaises(ValueError, u"{0".format)
   1553         self.assertRaises(IndexError, u"{0.}".format)
   1554         self.assertRaises(ValueError, u"{0.}".format, 0)
   1555         self.assertRaises(IndexError, u"{0[}".format)
   1556         self.assertRaises(ValueError, u"{0[}".format, [])
   1557         self.assertRaises(KeyError,   u"{0]}".format)
   1558         self.assertRaises(ValueError, u"{0.[]}".format, 0)
   1559         self.assertRaises(ValueError, u"{0..foo}".format, 0)
   1560         self.assertRaises(ValueError, u"{0[0}".format, 0)
   1561         self.assertRaises(ValueError, u"{0[0:foo}".format, 0)
   1562         self.assertRaises(KeyError,   u"{c]}".format)
   1563         self.assertRaises(ValueError, u"{{ {{{0}}".format, 0)
   1564         self.assertRaises(ValueError, u"{0}}".format, 0)
   1565         self.assertRaises(KeyError,   u"{foo}".format, bar=3)
   1566         self.assertRaises(ValueError, u"{0!x}".format, 3)
   1567         self.assertRaises(ValueError, u"{0!}".format, 0)
   1568         self.assertRaises(ValueError, u"{0!rs}".format, 0)
   1569         self.assertRaises(ValueError, u"{!}".format)
   1570         self.assertRaises(IndexError, u"{:}".format)
   1571         self.assertRaises(IndexError, u"{:s}".format)
   1572         self.assertRaises(IndexError, u"{}".format)
                # A very long digit string used as a positional index and as a
                # subscript; both uses must raise ValueError.
   1573         big = u"23098475029384702983476098230754973209482573"
   1574         self.assertRaises(ValueError, (u"{" + big + u"}").format)
   1575         self.assertRaises(ValueError, (u"{[" + big + u"]}").format, [0])
   1576 
   1577         # issue 6089
   1578         self.assertRaises(ValueError, u"{0[0]x}".format, [None])
   1579         self.assertRaises(ValueError, u"{0[0](10)}".format, [None])
   1580 
   1581         # can't have a replacement on the field name portion
   1582         self.assertRaises(TypeError, u'{0[{1}]}'.format, u'abcdefg', 4)
   1583 
   1584         # exceed maximum recursion depth
   1585         self.assertRaises(ValueError, u"{0:{1:{2}}}".format, u'abc', u's', u'')
   1586         self.assertRaises(ValueError, u"{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
   1587                           0, 1, 2, 3, 4, 5, 6, 7)
   1588 
   1589         # string format spec errors
   1590         self.assertRaises(ValueError, u"{0:-s}".format, u'')
   1591         self.assertRaises(ValueError, format, u"", u"-")
   1592         self.assertRaises(ValueError, u"{0:=s}".format, u'')
   1593 
   1594         # test combining string and unicode
   1595         self.assertEqual(u"foo{0}".format('bar'), u'foobar')
   1596         # This will try to convert the argument from unicode to str, which
   1597         #  will succeed
   1598         self.assertEqual("foo{0}".format(u'bar'), 'foobar')
   1599         # This will try to convert the argument from unicode to str, which
   1600         #  will fail
   1601         self.assertRaises(UnicodeEncodeError, "foo{0}".format, u'\u1000bar')
   1602 
   1603     def test_format_huge_precision(self):
                # Builds the spec ".<sys.maxsize + 1>f"; formatting a float with that
                # precision must raise ValueError.  `result` is never read.
   1604         format_string = u".{}f".format(sys.maxsize + 1)
   1605         with self.assertRaises(ValueError):
   1606             result = format(2.34, format_string)
   1607 
   1608     def test_format_huge_width(self):
                # Builds the spec "<sys.maxsize + 1>f"; formatting a float with that
                # width must raise ValueError.  `result` is never read.
   1609         format_string = u"{}f".format(sys.maxsize + 1)
   1610         with self.assertRaises(ValueError):
   1611             result = format(2.34, format_string)
   1612 
   1613     def test_format_huge_item_number(self):
                # Builds the template "{<sys.maxsize + 1>:.6f}" (the doubled braces
                # are escapes); using that positional index must raise ValueError.
                # `result` is never read.
   1614         format_string = u"{{{}:.6f}}".format(sys.maxsize + 1)
   1615         with self.assertRaises(ValueError):
   1616             result = format_string.format(2.34)
   1617 
   1618     def test_format_auto_numbering(self):
                # Replacement fields without an explicit index ("{}") take the
                # positional arguments in order.

                # C stores its argument as _x and echoes the format spec back from
                # __format__.
   1619         class C:
   1620             def __init__(self, x=100):
   1621                 self._x = x
   1622             def __format__(self, spec):
   1623                 return spec
   1624 
                # Auto-numbered fields combined with a format spec, a conversion,
                # attribute access and subscripts.
   1625         self.assertEqual(u'{}'.format(10), u'10')
   1626         self.assertEqual(u'{:5}'.format('s'), u's    ')
   1627         self.assertEqual(u'{!r}'.format('s'), u"'s'")
   1628         self.assertEqual(u'{._x}'.format(C(10)), u'10')
   1629         self.assertEqual(u'{[1]}'.format([1, 2]), u'2')
   1630         self.assertEqual(u'{[a]}'.format({'a':4, 'b':2}), u'4')
   1631         self.assertEqual(u'a{}b{}c'.format(0, 1), u'a0b1c')
   1632 
                # An auto-numbered field nested inside a format spec consumes the
                # next positional argument.
   1633         self.assertEqual(u'a{:{}}b'.format('x', '^10'), u'a    x     b')
   1634         self.assertEqual(u'a{:{}x}b'.format(20, '#'), u'a0x14b')
   1635 
   1636         # can't mix and match numbering and auto-numbering
   1637         self.assertRaises(ValueError, u'{}{1}'.format, 1, 2)
   1638         self.assertRaises(ValueError, u'{1}{}'.format, 1, 2)
   1639         self.assertRaises(ValueError, u'{:{1}}'.format, 1, 2)
   1640         self.assertRaises(ValueError, u'{0:{}}'.format, 1, 2)
   1641 
   1642         # can mix and match auto-numbering and named
   1643         self.assertEqual(u'{f}{}'.format(4, f='test'), u'test4')
   1644         self.assertEqual(u'{}{f}'.format(4, f='test'), u'4test')
   1645         self.assertEqual(u'{:{f}}{g}{}'.format(1, 3, g='g', f=2), u' 1g3')
   1646         self.assertEqual(u'{f:{}}{}{g}'.format(2, 4, f=1, g='g'), u' 14g')
   1647 
   1648     def test_raiseMemError(self):
   1649         # Ensure that the freelist contains a consistent object, even
   1650         # when a string allocation fails with a MemoryError.
   1651         # This used to crash the interpreter,
   1652         # or leak references when the number was smaller.
                # Size in bytes assumed per character: 4 when sys.maxunicode is at
                # least 0x10000, otherwise 2.
   1653         charwidth = 4 if sys.maxunicode >= 0x10000 else 2
   1654         # Note: sys.maxsize is half of the actual max allocation because of
   1655         # the signedness of Py_ssize_t.
   1656         alloc = lambda: u"a" * (sys.maxsize // charwidth * 2)
                # The oversized allocation is attempted twice and must raise
                # MemoryError both times (presumably the second attempt is what
                # exercises the state left behind by the first failure).
   1657         self.assertRaises(MemoryError, alloc)
   1658         self.assertRaises(MemoryError, alloc)
   1659 
   1660     def test_format_subclass(self):
                # A unicode subclass that overrides __unicode__: both "%s"
                # interpolation and str.format() with byte-string templates are
                # expected to produce the overridden text rather than u'xxx'.
   1661         class U(unicode):
   1662             def __unicode__(self):
   1663                 return u'__unicode__ overridden'
   1664         u = U(u'xxx')
   1665         self.assertEqual("%s" % u, u'__unicode__ overridden')
   1666         self.assertEqual("{}".format(u), '__unicode__ overridden')
   1667 
   1668     def test_free_after_iterating(self):
                # Delegates to test_support.check_free_after_iterating for forward
                # (iter) and reverse (reversed) iteration over the unicode type.
                # NOTE(review): the helper is defined outside this file; judging by
                # its name it checks that the iterated object is released once
                # iteration finishes -- confirm against test_support.
   1669         test_support.check_free_after_iterating(self, iter, unicode)
   1670         test_support.check_free_after_iterating(self, reversed, unicode)
   1671 
   1672 
   1673 class CAPITest(unittest.TestCase):
   1674 
   1675     # Test PyUnicode_FromFormat()
   1676     def test_from_format(self):
   1677         test_support.import_module('ctypes')
   1678         from ctypes import (
   1679             pythonapi, py_object, sizeof,
   1680             c_int, c_long, c_longlong, c_ssize_t,
   1681             c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
   1682         if sys.maxunicode == 0xffff:
   1683             name = "PyUnicodeUCS2_FromFormat"
   1684         else:
   1685             name = "PyUnicodeUCS4_FromFormat"
   1686         _PyUnicode_FromFormat = getattr(pythonapi, name)
   1687         _PyUnicode_FromFormat.restype = py_object
   1688 
   1689         def PyUnicode_FromFormat(format, *args):
   1690             cargs = tuple(
   1691                 py_object(arg) if isinstance(arg, unicode) else arg
   1692                 for arg in args)
   1693             return _PyUnicode_FromFormat(format, *cargs)
   1694 
   1695         def check_format(expected, format, *args):
   1696             text = PyUnicode_FromFormat(format, *args)
   1697             self.assertEqual(expected, text)
   1698 
   1699         # ascii format, non-ascii argument
   1700         check_format(u'ascii\x7f=unicode\xe9',
   1701                      b'ascii\x7f=%U', u'unicode\xe9')
   1702 
   1703         # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
   1704         # raises an error
   1705         #self.assertRaisesRegex(ValueError,
   1706         #    '^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
   1707         #    'string, got a non-ASCII byte: 0xe9$',
   1708         #    PyUnicode_FromFormat, b'unicode\xe9=%s', u'ascii')
   1709 
   1710         # test "%c"
   1711         check_format(u'\uabcd',
   1712                      b'%c', c_int(0xabcd))
   1713         if sys.maxunicode > 0xffff:
   1714             check_format(u'\U0010ffff',
   1715                          b'%c', c_int(0x10ffff))
   1716         else:
   1717             with self.assertRaises(OverflowError):
   1718                 PyUnicode_FromFormat(b'%c', c_int(0x10000))
   1719         with self.assertRaises(OverflowError):
   1720             PyUnicode_FromFormat(b'%c', c_int(0x110000))
   1721         # Issue #18183
   1722         if sys.maxunicode > 0xffff:
   1723             check_format(u'\U00010000\U00100000',
   1724                          b'%c%c', c_int(0x10000), c_int(0x100000))
   1725 
   1726         # test "%"
   1727         check_format(u'%',
   1728                      b'%')
   1729         check_format(u'%',
   1730                      b'%%')
   1731         check_format(u'%s',
   1732                      b'%%s')
   1733         check_format(u'[%]',
   1734                      b'[%%]')
   1735         check_format(u'%abc',
   1736                      b'%%%s', b'abc')
   1737 
   1738         # test %S
   1739         check_format(u"repr=abc",
   1740                      b'repr=%S', u'abc')
   1741 
   1742         # test %R
   1743         check_format(u"repr=u'abc'",
   1744                      b'repr=%R', u'abc')
   1745 
   1746         # test integer formats (%i, %d, %u)
   1747         check_format(u'010',
   1748                      b'%03i', c_int(10))
   1749         check_format(u'0010',
   1750                      b'%0.4i', c_int(10))
   1751         check_format(u'-123',
   1752                      b'%i', c_int(-123))
   1753 
   1754         check_format(u'-123',
   1755                      b'%d', c_int(-123))
   1756         check_format(u'-123',
   1757                      b'%ld', c_long(-123))
   1758         check_format(u'-123',
   1759                      b'%zd', c_ssize_t(-123))
   1760 
   1761         check_format(u'123',
   1762                      b'%u', c_uint(123))
   1763         check_format(u'123',
   1764                      b'%lu', c_ulong(123))
   1765         check_format(u'123',
   1766                      b'%zu', c_size_t(123))
   1767 
   1768         # test long output
   1769         min_long = -(2 ** (8 * sizeof(c_long) - 1))
   1770         max_long = -min_long - 1
   1771         check_format(unicode(min_long),
   1772                      b'%ld', c_long(min_long))
   1773         check_format(unicode(max_long),
   1774                      b'%ld', c_long(max_long))
   1775         max_ulong = 2 ** (8 * sizeof(c_ulong)) - 1
   1776         check_format(unicode(max_ulong),
   1777                      b'%lu', c_ulong(max_ulong))
   1778         PyUnicode_FromFormat(b'%p', c_void_p(-1))
   1779 
   1780         # test padding (width and/or precision)
   1781         check_format(u'123'.rjust(10, u'0'),
   1782                      b'%010i', c_int(123))
   1783         check_format(u'123'.rjust(100),
   1784                      b'%100i', c_int(123))
   1785         check_format(u'123'.rjust(100, u'0'),
   1786                      b'%.100i', c_int(123))
   1787         check_format(u'123'.rjust(80, u'0').rjust(100),
   1788                      b'%100.80i', c_int(123))
   1789 
   1790         check_format(u'123'.rjust(10, u'0'),
   1791                      b'%010u', c_uint(123))
   1792         check_format(u'123'.rjust(100),
   1793                      b'%100u', c_uint(123))
   1794         check_format(u'123'.rjust(100, u'0'),
   1795                      b'%.100u', c_uint(123))
   1796         check_format(u'123'.rjust(80, u'0').rjust(100),
   1797                      b'%100.80u', c_uint(123))
   1798 
   1799         check_format(u'123'.rjust(10, u'0'),
   1800                      b'%010x', c_int(0x123))
   1801         check_format(u'123'.rjust(100),
   1802                      b'%100x', c_int(0x123))
   1803         check_format(u'123'.rjust(100, u'0'),
   1804                      b'%.100x', c_int(0x123))
   1805         check_format(u'123'.rjust(80, u'0').rjust(100),
   1806                      b'%100.80x', c_int(0x123))
   1807 
   1808         # test %V
   1809         check_format(u'repr=abc',
   1810                      b'repr=%V', u'abc', b'xyz')
   1811         check_format(u'repr=\xe4\xba\xba\xe6\xb0\x91',
   1812                      b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
   1813         check_format(u'repr=abc\xff',
   1814                      b'repr=%V', None, b'abc\xff')
   1815 
   1816         # not supported: copy the raw format string. these tests are just here
   1817         # to check for crashes and should not be considered as specifications
   1818         check_format(u'%s',
   1819                      b'%1%s', b'abc')
   1820         check_format(u'%1abc',
   1821                      b'%1abc')
   1822         check_format(u'%+i',
   1823                      b'%+i', c_int(10))
   1824         check_format(u'%s',
   1825                      b'%.%s', b'abc')
   1826 
   1827     @test_support.cpython_only
   1828     def test_encode_decimal(self):
   1829         from _testcapi import unicode_encodedecimal
   1830         self.assertEqual(unicode_encodedecimal(u'123'),
   1831                          b'123')
   1832         self.assertEqual(unicode_encodedecimal(u'\u0663.\u0661\u0664'),
   1833                          b'3.14')
   1834         self.assertEqual(unicode_encodedecimal(u"\N{EM SPACE}3.14\N{EN SPACE}"),
   1835                          b' 3.14 ')
   1836         self.assertRaises(UnicodeEncodeError,
   1837                           unicode_encodedecimal, u"123\u20ac", "strict")
   1838         self.assertEqual(unicode_encodedecimal(u"123\u20ac", "replace"),
   1839                          b'123?')
   1840         self.assertEqual(unicode_encodedecimal(u"123\u20ac", "ignore"),
   1841                          b'123')
   1842         self.assertEqual(unicode_encodedecimal(u"123\u20ac", "xmlcharrefreplace"),
   1843                          b'123&#8364;')
   1844         self.assertEqual(unicode_encodedecimal(u"123\u20ac", "backslashreplace"),
   1845                          b'123\\u20ac')
   1846         self.assertEqual(unicode_encodedecimal(u"123\u20ac\N{EM SPACE}", "replace"),
   1847                          b'123? ')
   1848         self.assertEqual(unicode_encodedecimal(u"123\u20ac\u20ac", "replace"),
   1849                          b'123??')
   1850         self.assertEqual(unicode_encodedecimal(u"123\u20ac\u0660", "replace"),
   1851                          b'123?0')
   1852 
   1853     @test_support.cpython_only
   1854     def test_encode_decimal_with_surrogates(self):
   1855         from _testcapi import unicode_encodedecimal
   1856         tests = [(u'\U0001f49d', '&#128157;'),
   1857                  (u'\ud83d', '&#55357;'),
   1858                  (u'\udc9d', '&#56477;'),
   1859                 ]
   1860         if u'\ud83d\udc9d' != u'\U0001f49d':
   1861             tests += [(u'\ud83d\udc9d', '&#55357;&#56477;')]
   1862         for s, exp in tests:
   1863             self.assertEqual(
   1864                     unicode_encodedecimal(u"123" + s, "xmlcharrefreplace"),
   1865                     '123' + exp)
   1866 
   1867 def test_main():
   1868     test_support.run_unittest(__name__)
   1869 
   1870 if __name__ == "__main__":
   1871     test_main()
   1872