Home | History | Annotate | Download | only in test
      1 """ Test script for the Unicode implementation.
      2 
      3 Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
      5 
      6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
      7 
      8 """#"
      9 
     10 import unittest
     11 import sys
     12 import _testcapi
     13 
     14 from test import test_support
     15 
     16 class UnicodeNamesTest(unittest.TestCase):
     17 
     18     def checkletter(self, name, code):
     19         # Helper that put all \N escapes inside eval'd raw strings,
     20         # to make sure this script runs even if the compiler
     21         # chokes on \N escapes
     22         res = eval(ur'u"\N{%s}"' % name)
     23         self.assertEqual(res, code)
     24         return res
     25 
     26     def test_general(self):
     27         # General and case insensitivity test:
     28         chars = [
     29             "LATIN CAPITAL LETTER T",
     30             "LATIN SMALL LETTER H",
     31             "LATIN SMALL LETTER E",
     32             "SPACE",
     33             "LATIN SMALL LETTER R",
     34             "LATIN CAPITAL LETTER E",
     35             "LATIN SMALL LETTER D",
     36             "SPACE",
     37             "LATIN SMALL LETTER f",
     38             "LATIN CAPITAL LeTtEr o",
     39             "LATIN SMaLl LETTER x",
     40             "SPACE",
     41             "LATIN SMALL LETTER A",
     42             "LATIN SMALL LETTER T",
     43             "LATIN SMALL LETTER E",
     44             "SPACE",
     45             "LATIN SMALL LETTER T",
     46             "LATIN SMALL LETTER H",
     47             "LATIN SMALL LETTER E",
     48             "SpAcE",
     49             "LATIN SMALL LETTER S",
     50             "LATIN SMALL LETTER H",
     51             "LATIN small LETTER e",
     52             "LATIN small LETTER e",
     53             "LATIN SMALL LETTER P",
     54             "FULL STOP"
     55         ]
     56         string = u"The rEd fOx ate the sheep."
     57 
     58         self.assertEqual(
     59             u"".join([self.checkletter(*args) for args in zip(chars, string)]),
     60             string
     61         )
     62 
     63     def test_ascii_letters(self):
     64         import unicodedata
     65 
     66         for char in "".join(map(chr, xrange(ord("a"), ord("z")))):
     67             name = "LATIN SMALL LETTER %s" % char.upper()
     68             code = unicodedata.lookup(name)
     69             self.assertEqual(unicodedata.name(code), name)
     70 
     71     def test_hangul_syllables(self):
     72         self.checkletter("HANGUL SYLLABLE GA", u"\uac00")
     73         self.checkletter("HANGUL SYLLABLE GGWEOSS", u"\uafe8")
     74         self.checkletter("HANGUL SYLLABLE DOLS", u"\ub3d0")
     75         self.checkletter("HANGUL SYLLABLE RYAN", u"\ub7b8")
     76         self.checkletter("HANGUL SYLLABLE MWIK", u"\ubba0")
     77         self.checkletter("HANGUL SYLLABLE BBWAEM", u"\ubf88")
     78         self.checkletter("HANGUL SYLLABLE SSEOL", u"\uc370")
     79         self.checkletter("HANGUL SYLLABLE YI", u"\uc758")
     80         self.checkletter("HANGUL SYLLABLE JJYOSS", u"\ucb40")
     81         self.checkletter("HANGUL SYLLABLE KYEOLS", u"\ucf28")
     82         self.checkletter("HANGUL SYLLABLE PAN", u"\ud310")
     83         self.checkletter("HANGUL SYLLABLE HWEOK", u"\ud6f8")
     84         self.checkletter("HANGUL SYLLABLE HIH", u"\ud7a3")
     85 
     86         import unicodedata
     87         self.assertRaises(ValueError, unicodedata.name, u"\ud7a4")
     88 
     89     def test_cjk_unified_ideographs(self):
     90         self.checkletter("CJK UNIFIED IDEOGRAPH-3400", u"\u3400")
     91         self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", u"\u4db5")
     92         self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", u"\u4e00")
     93         self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", u"\u9fa5")
     94         self.checkletter("CJK UNIFIED IDEOGRAPH-20000", u"\U00020000")
     95         self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", u"\U0002a6d6")
     96 
     97     def test_bmp_characters(self):
     98         import unicodedata
     99         count = 0
    100         for code in xrange(0x10000):
    101             char = unichr(code)
    102             name = unicodedata.name(char, None)
    103             if name is not None:
    104                 self.assertEqual(unicodedata.lookup(name), char)
    105                 count += 1
    106 
    107     def test_misc_symbols(self):
    108         self.checkletter("PILCROW SIGN", u"\u00b6")
    109         self.checkletter("REPLACEMENT CHARACTER", u"\uFFFD")
    110         self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", u"\uFF9F")
    111         self.checkletter("FULLWIDTH LATIN SMALL LETTER A", u"\uFF41")
    112 
    113     def test_errors(self):
    114         import unicodedata
    115         self.assertRaises(TypeError, unicodedata.name)
    116         self.assertRaises(TypeError, unicodedata.name, u'xx')
    117         self.assertRaises(TypeError, unicodedata.lookup)
    118         self.assertRaises(KeyError, unicodedata.lookup, u'unknown')
    119 
    120     def test_strict_eror_handling(self):
    121         # bogus character name
    122         self.assertRaises(
    123             UnicodeError,
    124             unicode, "\\N{blah}", 'unicode-escape', 'strict'
    125         )
    126         # long bogus character name
    127         self.assertRaises(
    128             UnicodeError,
    129             unicode, "\\N{%s}" % ("x" * 100000), 'unicode-escape', 'strict'
    130         )
    131         # missing closing brace
    132         self.assertRaises(
    133             UnicodeError,
    134             unicode, "\\N{SPACE", 'unicode-escape', 'strict'
    135         )
    136         # missing opening brace
    137         self.assertRaises(
    138             UnicodeError,
    139             unicode, "\\NSPACE", 'unicode-escape', 'strict'
    140         )
    141 
    142     @unittest.skipUnless(_testcapi.INT_MAX < _testcapi.PY_SSIZE_T_MAX,
    143                          "needs UINT_MAX < SIZE_MAX")
    144     @unittest.skipUnless(_testcapi.UINT_MAX < sys.maxint,
    145                          "needs UINT_MAX < sys.maxint")
    146     @test_support.bigmemtest(minsize=_testcapi.UINT_MAX + 1,
    147                              memuse=2 + 4 // len(u'\U00010000'))
    148     def test_issue16335(self, size):
    149         func = self.test_issue16335
    150         if size < func.minsize:
    151             raise unittest.SkipTest("not enough memory: %.1fG minimum needed" %
    152                     (func.minsize * func.memuse / float(1024**3),))
    153         # very very long bogus character name
    154         x = b'\\N{SPACE' + b'x' * int(_testcapi.UINT_MAX + 1) + b'}'
    155         self.assertEqual(len(x), len(b'\\N{SPACE}') +
    156                                     (_testcapi.UINT_MAX + 1))
    157         self.assertRaisesRegexp(UnicodeError,
    158             'unknown Unicode character name',
    159             x.decode, 'unicode-escape'
    160         )
    161 
    162 
    163 def test_main():
    164     test_support.run_unittest(UnicodeNamesTest)
    165 
# Run the tests when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()
    168