1 """ Test script for the Unicode implementation. 2 3 Written by Bill Tutt. 4 Modified for Python 2.0 by Fredrik Lundh (fredrik (at] pythonware.com) 5 6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 7 8 """#" 9 10 import unittest 11 import sys 12 13 from test import test_support 14 15 try: 16 from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX 17 except ImportError: 18 INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1 19 20 class UnicodeNamesTest(unittest.TestCase): 21 22 def checkletter(self, name, code): 23 # Helper that put all \N escapes inside eval'd raw strings, 24 # to make sure this script runs even if the compiler 25 # chokes on \N escapes 26 res = eval(ur'u"\N{%s}"' % name) 27 self.assertEqual(res, code) 28 return res 29 30 def test_general(self): 31 # General and case insensitivity test: 32 chars = [ 33 "LATIN CAPITAL LETTER T", 34 "LATIN SMALL LETTER H", 35 "LATIN SMALL LETTER E", 36 "SPACE", 37 "LATIN SMALL LETTER R", 38 "LATIN CAPITAL LETTER E", 39 "LATIN SMALL LETTER D", 40 "SPACE", 41 "LATIN SMALL LETTER f", 42 "LATIN CAPITAL LeTtEr o", 43 "LATIN SMaLl LETTER x", 44 "SPACE", 45 "LATIN SMALL LETTER A", 46 "LATIN SMALL LETTER T", 47 "LATIN SMALL LETTER E", 48 "SPACE", 49 "LATIN SMALL LETTER T", 50 "LATIN SMALL LETTER H", 51 "LATIN SMALL LETTER E", 52 "SpAcE", 53 "LATIN SMALL LETTER S", 54 "LATIN SMALL LETTER H", 55 "LATIN small LETTER e", 56 "LATIN small LETTER e", 57 "LATIN SMALL LETTER P", 58 "FULL STOP" 59 ] 60 string = u"The rEd fOx ate the sheep." 61 62 self.assertEqual( 63 u"".join([self.checkletter(*args) for args in zip(chars, string)]), 64 string 65 ) 66 67 def test_ascii_letters(self): 68 import unicodedata 69 70 for char in "".join(map(chr, xrange(ord("a"), ord("z")))): 71 name = "LATIN SMALL LETTER %s" % char.upper() 72 code = unicodedata.lookup(name) 73 self.assertEqual(unicodedata.name(code), name) 74 75 def test_hangul_syllables(self): 76 self.checkletter("HANGUL SYLLABLE GA", u"\uac00") 77 self.checkletter("HANGUL SYLLABLE GGWEOSS", u"\uafe8") 78 self.checkletter("HANGUL SYLLABLE DOLS", u"\ub3d0") 79 self.checkletter("HANGUL SYLLABLE RYAN", u"\ub7b8") 80 self.checkletter("HANGUL SYLLABLE MWIK", u"\ubba0") 81 self.checkletter("HANGUL SYLLABLE BBWAEM", u"\ubf88") 82 self.checkletter("HANGUL SYLLABLE SSEOL", u"\uc370") 83 self.checkletter("HANGUL SYLLABLE YI", u"\uc758") 84 self.checkletter("HANGUL SYLLABLE JJYOSS", u"\ucb40") 85 self.checkletter("HANGUL SYLLABLE KYEOLS", u"\ucf28") 86 self.checkletter("HANGUL SYLLABLE PAN", u"\ud310") 87 self.checkletter("HANGUL SYLLABLE HWEOK", u"\ud6f8") 88 self.checkletter("HANGUL SYLLABLE HIH", u"\ud7a3") 89 90 import unicodedata 91 self.assertRaises(ValueError, unicodedata.name, u"\ud7a4") 92 93 def test_cjk_unified_ideographs(self): 94 self.checkletter("CJK UNIFIED IDEOGRAPH-3400", u"\u3400") 95 self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", u"\u4db5") 96 self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", u"\u4e00") 97 self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", u"\u9fa5") 98 self.checkletter("CJK UNIFIED IDEOGRAPH-20000", u"\U00020000") 99 self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", u"\U0002a6d6") 100 101 def test_bmp_characters(self): 102 import unicodedata 103 count = 0 104 for code in xrange(0x10000): 105 char = unichr(code) 106 name = unicodedata.name(char, None) 107 if name is not None: 108 self.assertEqual(unicodedata.lookup(name), char) 109 count += 1 110 111 def test_misc_symbols(self): 112 self.checkletter("PILCROW SIGN", u"\u00b6") 113 self.checkletter("REPLACEMENT CHARACTER", u"\uFFFD") 114 self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", u"\uFF9F") 115 self.checkletter("FULLWIDTH LATIN SMALL LETTER A", u"\uFF41") 116 117 def test_errors(self): 118 import unicodedata 119 self.assertRaises(TypeError, unicodedata.name) 120 self.assertRaises(TypeError, unicodedata.name, u'xx') 121 self.assertRaises(TypeError, unicodedata.lookup) 122 self.assertRaises(KeyError, unicodedata.lookup, u'unknown') 123 124 def test_strict_eror_handling(self): 125 # bogus character name 126 self.assertRaises( 127 UnicodeError, 128 unicode, "\\N{blah}", 'unicode-escape', 'strict' 129 ) 130 # long bogus character name 131 self.assertRaises( 132 UnicodeError, 133 unicode, "\\N{%s}" % ("x" * 100000), 'unicode-escape', 'strict' 134 ) 135 # missing closing brace 136 self.assertRaises( 137 UnicodeError, 138 unicode, "\\N{SPACE", 'unicode-escape', 'strict' 139 ) 140 # missing opening brace 141 self.assertRaises( 142 UnicodeError, 143 unicode, "\\NSPACE", 'unicode-escape', 'strict' 144 ) 145 146 @test_support.cpython_only 147 @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX") 148 @unittest.skipUnless(UINT_MAX < sys.maxint, "needs UINT_MAX < sys.maxint") 149 @test_support.bigmemtest(minsize=UINT_MAX + 1, 150 memuse=2 + 4 // len(u'\U00010000')) 151 def test_issue16335(self, size): 152 func = self.test_issue16335 153 if size < func.minsize: 154 raise unittest.SkipTest("not enough memory: %.1fG minimum needed" % 155 (func.minsize * func.memuse / float(1024**3),)) 156 # very very long bogus character name 157 x = b'\\N{SPACE' + b'x' * int(UINT_MAX + 1) + b'}' 158 self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1)) 159 self.assertRaisesRegexp(UnicodeError, 160 'unknown Unicode character name', 161 x.decode, 'unicode-escape' 162 ) 163 164 165 def test_main(): 166 test_support.run_unittest(UnicodeNamesTest) 167 168 if __name__ == "__main__": 169 test_main() 170