1 """ Test script for the Unicode implementation. 2 3 Written by Bill Tutt. 4 Modified for Python 2.0 by Fredrik Lundh (fredrik (at] pythonware.com) 5 6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 7 8 """#" 9 10 import unittest 11 import sys 12 import _testcapi 13 14 from test import test_support 15 16 class UnicodeNamesTest(unittest.TestCase): 17 18 def checkletter(self, name, code): 19 # Helper that put all \N escapes inside eval'd raw strings, 20 # to make sure this script runs even if the compiler 21 # chokes on \N escapes 22 res = eval(ur'u"\N{%s}"' % name) 23 self.assertEqual(res, code) 24 return res 25 26 def test_general(self): 27 # General and case insensitivity test: 28 chars = [ 29 "LATIN CAPITAL LETTER T", 30 "LATIN SMALL LETTER H", 31 "LATIN SMALL LETTER E", 32 "SPACE", 33 "LATIN SMALL LETTER R", 34 "LATIN CAPITAL LETTER E", 35 "LATIN SMALL LETTER D", 36 "SPACE", 37 "LATIN SMALL LETTER f", 38 "LATIN CAPITAL LeTtEr o", 39 "LATIN SMaLl LETTER x", 40 "SPACE", 41 "LATIN SMALL LETTER A", 42 "LATIN SMALL LETTER T", 43 "LATIN SMALL LETTER E", 44 "SPACE", 45 "LATIN SMALL LETTER T", 46 "LATIN SMALL LETTER H", 47 "LATIN SMALL LETTER E", 48 "SpAcE", 49 "LATIN SMALL LETTER S", 50 "LATIN SMALL LETTER H", 51 "LATIN small LETTER e", 52 "LATIN small LETTER e", 53 "LATIN SMALL LETTER P", 54 "FULL STOP" 55 ] 56 string = u"The rEd fOx ate the sheep." 57 58 self.assertEqual( 59 u"".join([self.checkletter(*args) for args in zip(chars, string)]), 60 string 61 ) 62 63 def test_ascii_letters(self): 64 import unicodedata 65 66 for char in "".join(map(chr, xrange(ord("a"), ord("z")))): 67 name = "LATIN SMALL LETTER %s" % char.upper() 68 code = unicodedata.lookup(name) 69 self.assertEqual(unicodedata.name(code), name) 70 71 def test_hangul_syllables(self): 72 self.checkletter("HANGUL SYLLABLE GA", u"\uac00") 73 self.checkletter("HANGUL SYLLABLE GGWEOSS", u"\uafe8") 74 self.checkletter("HANGUL SYLLABLE DOLS", u"\ub3d0") 75 self.checkletter("HANGUL SYLLABLE RYAN", u"\ub7b8") 76 self.checkletter("HANGUL SYLLABLE MWIK", u"\ubba0") 77 self.checkletter("HANGUL SYLLABLE BBWAEM", u"\ubf88") 78 self.checkletter("HANGUL SYLLABLE SSEOL", u"\uc370") 79 self.checkletter("HANGUL SYLLABLE YI", u"\uc758") 80 self.checkletter("HANGUL SYLLABLE JJYOSS", u"\ucb40") 81 self.checkletter("HANGUL SYLLABLE KYEOLS", u"\ucf28") 82 self.checkletter("HANGUL SYLLABLE PAN", u"\ud310") 83 self.checkletter("HANGUL SYLLABLE HWEOK", u"\ud6f8") 84 self.checkletter("HANGUL SYLLABLE HIH", u"\ud7a3") 85 86 import unicodedata 87 self.assertRaises(ValueError, unicodedata.name, u"\ud7a4") 88 89 def test_cjk_unified_ideographs(self): 90 self.checkletter("CJK UNIFIED IDEOGRAPH-3400", u"\u3400") 91 self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", u"\u4db5") 92 self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", u"\u4e00") 93 self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", u"\u9fa5") 94 self.checkletter("CJK UNIFIED IDEOGRAPH-20000", u"\U00020000") 95 self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", u"\U0002a6d6") 96 97 def test_bmp_characters(self): 98 import unicodedata 99 count = 0 100 for code in xrange(0x10000): 101 char = unichr(code) 102 name = unicodedata.name(char, None) 103 if name is not None: 104 self.assertEqual(unicodedata.lookup(name), char) 105 count += 1 106 107 def test_misc_symbols(self): 108 self.checkletter("PILCROW SIGN", u"\u00b6") 109 self.checkletter("REPLACEMENT CHARACTER", u"\uFFFD") 110 self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", u"\uFF9F") 111 self.checkletter("FULLWIDTH LATIN SMALL LETTER A", u"\uFF41") 112 113 def test_errors(self): 114 import unicodedata 115 self.assertRaises(TypeError, unicodedata.name) 116 self.assertRaises(TypeError, unicodedata.name, u'xx') 117 self.assertRaises(TypeError, unicodedata.lookup) 118 self.assertRaises(KeyError, unicodedata.lookup, u'unknown') 119 120 def test_strict_eror_handling(self): 121 # bogus character name 122 self.assertRaises( 123 UnicodeError, 124 unicode, "\\N{blah}", 'unicode-escape', 'strict' 125 ) 126 # long bogus character name 127 self.assertRaises( 128 UnicodeError, 129 unicode, "\\N{%s}" % ("x" * 100000), 'unicode-escape', 'strict' 130 ) 131 # missing closing brace 132 self.assertRaises( 133 UnicodeError, 134 unicode, "\\N{SPACE", 'unicode-escape', 'strict' 135 ) 136 # missing opening brace 137 self.assertRaises( 138 UnicodeError, 139 unicode, "\\NSPACE", 'unicode-escape', 'strict' 140 ) 141 142 @unittest.skipUnless(_testcapi.INT_MAX < _testcapi.PY_SSIZE_T_MAX, 143 "needs UINT_MAX < SIZE_MAX") 144 @unittest.skipUnless(_testcapi.UINT_MAX < sys.maxint, 145 "needs UINT_MAX < sys.maxint") 146 @test_support.bigmemtest(minsize=_testcapi.UINT_MAX + 1, 147 memuse=2 + 4 // len(u'\U00010000')) 148 def test_issue16335(self, size): 149 func = self.test_issue16335 150 if size < func.minsize: 151 raise unittest.SkipTest("not enough memory: %.1fG minimum needed" % 152 (func.minsize * func.memuse / float(1024**3),)) 153 # very very long bogus character name 154 x = b'\\N{SPACE' + b'x' * int(_testcapi.UINT_MAX + 1) + b'}' 155 self.assertEqual(len(x), len(b'\\N{SPACE}') + 156 (_testcapi.UINT_MAX + 1)) 157 self.assertRaisesRegexp(UnicodeError, 158 'unknown Unicode character name', 159 x.decode, 'unicode-escape' 160 ) 161 162 163 def test_main(): 164 test_support.run_unittest(UnicodeNamesTest) 165 166 if __name__ == "__main__": 167 test_main() 168