Home | History | Annotate | Download | only in test
      1 """ Test script for the Unicode implementation.
      2 
      3 Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
      5 
      6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
      7 
      8 """#"
      9 
     10 import unittest
     11 import sys
     12 
     13 from test import test_support
     14 
     15 try:
     16     from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
     17 except ImportError:
     18     INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1
     19 
     20 class UnicodeNamesTest(unittest.TestCase):
     21 
     22     def checkletter(self, name, code):
     23         # Helper that put all \N escapes inside eval'd raw strings,
     24         # to make sure this script runs even if the compiler
     25         # chokes on \N escapes
     26         res = eval(ur'u"\N{%s}"' % name)
     27         self.assertEqual(res, code)
     28         return res
     29 
     30     def test_general(self):
     31         # General and case insensitivity test:
     32         chars = [
     33             "LATIN CAPITAL LETTER T",
     34             "LATIN SMALL LETTER H",
     35             "LATIN SMALL LETTER E",
     36             "SPACE",
     37             "LATIN SMALL LETTER R",
     38             "LATIN CAPITAL LETTER E",
     39             "LATIN SMALL LETTER D",
     40             "SPACE",
     41             "LATIN SMALL LETTER f",
     42             "LATIN CAPITAL LeTtEr o",
     43             "LATIN SMaLl LETTER x",
     44             "SPACE",
     45             "LATIN SMALL LETTER A",
     46             "LATIN SMALL LETTER T",
     47             "LATIN SMALL LETTER E",
     48             "SPACE",
     49             "LATIN SMALL LETTER T",
     50             "LATIN SMALL LETTER H",
     51             "LATIN SMALL LETTER E",
     52             "SpAcE",
     53             "LATIN SMALL LETTER S",
     54             "LATIN SMALL LETTER H",
     55             "LATIN small LETTER e",
     56             "LATIN small LETTER e",
     57             "LATIN SMALL LETTER P",
     58             "FULL STOP"
     59         ]
     60         string = u"The rEd fOx ate the sheep."
     61 
     62         self.assertEqual(
     63             u"".join([self.checkletter(*args) for args in zip(chars, string)]),
     64             string
     65         )
     66 
     67     def test_ascii_letters(self):
     68         import unicodedata
     69 
     70         for char in "".join(map(chr, xrange(ord("a"), ord("z")))):
     71             name = "LATIN SMALL LETTER %s" % char.upper()
     72             code = unicodedata.lookup(name)
     73             self.assertEqual(unicodedata.name(code), name)
     74 
     75     def test_hangul_syllables(self):
     76         self.checkletter("HANGUL SYLLABLE GA", u"\uac00")
     77         self.checkletter("HANGUL SYLLABLE GGWEOSS", u"\uafe8")
     78         self.checkletter("HANGUL SYLLABLE DOLS", u"\ub3d0")
     79         self.checkletter("HANGUL SYLLABLE RYAN", u"\ub7b8")
     80         self.checkletter("HANGUL SYLLABLE MWIK", u"\ubba0")
     81         self.checkletter("HANGUL SYLLABLE BBWAEM", u"\ubf88")
     82         self.checkletter("HANGUL SYLLABLE SSEOL", u"\uc370")
     83         self.checkletter("HANGUL SYLLABLE YI", u"\uc758")
     84         self.checkletter("HANGUL SYLLABLE JJYOSS", u"\ucb40")
     85         self.checkletter("HANGUL SYLLABLE KYEOLS", u"\ucf28")
     86         self.checkletter("HANGUL SYLLABLE PAN", u"\ud310")
     87         self.checkletter("HANGUL SYLLABLE HWEOK", u"\ud6f8")
     88         self.checkletter("HANGUL SYLLABLE HIH", u"\ud7a3")
     89 
     90         import unicodedata
     91         self.assertRaises(ValueError, unicodedata.name, u"\ud7a4")
     92 
     93     def test_cjk_unified_ideographs(self):
     94         self.checkletter("CJK UNIFIED IDEOGRAPH-3400", u"\u3400")
     95         self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", u"\u4db5")
     96         self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", u"\u4e00")
     97         self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", u"\u9fa5")
     98         self.checkletter("CJK UNIFIED IDEOGRAPH-20000", u"\U00020000")
     99         self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", u"\U0002a6d6")
    100 
    101     def test_bmp_characters(self):
    102         import unicodedata
    103         count = 0
    104         for code in xrange(0x10000):
    105             char = unichr(code)
    106             name = unicodedata.name(char, None)
    107             if name is not None:
    108                 self.assertEqual(unicodedata.lookup(name), char)
    109                 count += 1
    110 
    111     def test_misc_symbols(self):
    112         self.checkletter("PILCROW SIGN", u"\u00b6")
    113         self.checkletter("REPLACEMENT CHARACTER", u"\uFFFD")
    114         self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", u"\uFF9F")
    115         self.checkletter("FULLWIDTH LATIN SMALL LETTER A", u"\uFF41")
    116 
    117     def test_errors(self):
    118         import unicodedata
    119         self.assertRaises(TypeError, unicodedata.name)
    120         self.assertRaises(TypeError, unicodedata.name, u'xx')
    121         self.assertRaises(TypeError, unicodedata.lookup)
    122         self.assertRaises(KeyError, unicodedata.lookup, u'unknown')
    123 
    124     def test_strict_eror_handling(self):
    125         # bogus character name
    126         self.assertRaises(
    127             UnicodeError,
    128             unicode, "\\N{blah}", 'unicode-escape', 'strict'
    129         )
    130         # long bogus character name
    131         self.assertRaises(
    132             UnicodeError,
    133             unicode, "\\N{%s}" % ("x" * 100000), 'unicode-escape', 'strict'
    134         )
    135         # missing closing brace
    136         self.assertRaises(
    137             UnicodeError,
    138             unicode, "\\N{SPACE", 'unicode-escape', 'strict'
    139         )
    140         # missing opening brace
    141         self.assertRaises(
    142             UnicodeError,
    143             unicode, "\\NSPACE", 'unicode-escape', 'strict'
    144         )
    145 
    146     @test_support.cpython_only
    147     @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
    148     @unittest.skipUnless(UINT_MAX < sys.maxint, "needs UINT_MAX < sys.maxint")
    149     @test_support.bigmemtest(minsize=UINT_MAX + 1,
    150                              memuse=2 + 4 // len(u'\U00010000'))
    151     def test_issue16335(self, size):
    152         func = self.test_issue16335
    153         if size < func.minsize:
    154             raise unittest.SkipTest("not enough memory: %.1fG minimum needed" %
    155                     (func.minsize * func.memuse / float(1024**3),))
    156         # very very long bogus character name
    157         x = b'\\N{SPACE' + b'x' * int(UINT_MAX + 1) + b'}'
    158         self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
    159         self.assertRaisesRegexp(UnicodeError,
    160             'unknown Unicode character name',
    161             x.decode, 'unicode-escape'
    162         )
    163 
    164 
    165 def test_main():
    166     test_support.run_unittest(UnicodeNamesTest)
    167 
# Run the tests when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()
    170