Home | History | Annotate | Download | only in test
      1 """ Test script for the Unicode implementation.
      2 
      3 Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
      5 
      6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
      7 
      8 """#"
      9 
     10 import unittest
     11 import unicodedata
     12 
     13 from test import support
     14 from http.client import HTTPException
     15 from test.test_normalization import check_version
     16 
# Pull the C integer limits from _testcapi when it can be imported.
# Otherwise bind all three names to one shared placeholder value: with the
# three names equal, any strict "<" comparison between them is false, so the
# skipUnless size guard on test_issue16335 skips that test.
try:
    from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
except ImportError:
    INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1
     21 
     22 class UnicodeNamesTest(unittest.TestCase):
     23 
     24     def checkletter(self, name, code):
     25         # Helper that put all \N escapes inside eval'd raw strings,
     26         # to make sure this script runs even if the compiler
     27         # chokes on \N escapes
     28         res = eval(r'"\N{%s}"' % name)
     29         self.assertEqual(res, code)
     30         return res
     31 
     32     def test_general(self):
     33         # General and case insensitivity test:
     34         chars = [
     35             "LATIN CAPITAL LETTER T",
     36             "LATIN SMALL LETTER H",
     37             "LATIN SMALL LETTER E",
     38             "SPACE",
     39             "LATIN SMALL LETTER R",
     40             "LATIN CAPITAL LETTER E",
     41             "LATIN SMALL LETTER D",
     42             "SPACE",
     43             "LATIN SMALL LETTER f",
     44             "LATIN CAPITAL LeTtEr o",
     45             "LATIN SMaLl LETTER x",
     46             "SPACE",
     47             "LATIN SMALL LETTER A",
     48             "LATIN SMALL LETTER T",
     49             "LATIN SMALL LETTER E",
     50             "SPACE",
     51             "LATIN SMALL LETTER T",
     52             "LATIN SMALL LETTER H",
     53             "LATIN SMALL LETTER E",
     54             "SpAcE",
     55             "LATIN SMALL LETTER S",
     56             "LATIN SMALL LETTER H",
     57             "LATIN small LETTER e",
     58             "LATIN small LETTER e",
     59             "LATIN SMALL LETTER P",
     60             "FULL STOP"
     61         ]
     62         string = "The rEd fOx ate the sheep."
     63 
     64         self.assertEqual(
     65             "".join([self.checkletter(*args) for args in zip(chars, string)]),
     66             string
     67         )
     68 
     69     def test_ascii_letters(self):
     70         for char in "".join(map(chr, range(ord("a"), ord("z")))):
     71             name = "LATIN SMALL LETTER %s" % char.upper()
     72             code = unicodedata.lookup(name)
     73             self.assertEqual(unicodedata.name(code), name)
     74 
     75     def test_hangul_syllables(self):
     76         self.checkletter("HANGUL SYLLABLE GA", "\uac00")
     77         self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
     78         self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
     79         self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
     80         self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
     81         self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
     82         self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
     83         self.checkletter("HANGUL SYLLABLE YI", "\uc758")
     84         self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
     85         self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
     86         self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
     87         self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
     88         self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")
     89 
     90         self.assertRaises(ValueError, unicodedata.name, "\ud7a4")
     91 
     92     def test_cjk_unified_ideographs(self):
     93         self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
     94         self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
     95         self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
     96         self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
     97         self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
     98         self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
     99         self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
    100         self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
    101         self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
    102         self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
    103 
    104     def test_bmp_characters(self):
    105         for code in range(0x10000):
    106             char = chr(code)
    107             name = unicodedata.name(char, None)
    108             if name is not None:
    109                 self.assertEqual(unicodedata.lookup(name), char)
    110 
    111     def test_misc_symbols(self):
    112         self.checkletter("PILCROW SIGN", "\u00b6")
    113         self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
    114         self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
    115         self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")
    116 
    117     def test_aliases(self):
    118         # Check that the aliases defined in the NameAliases.txt file work.
    119         # This should be updated when new aliases are added or the file
    120         # should be downloaded and parsed instead.  See #12753.
    121         aliases = [
    122             ('LATIN CAPITAL LETTER GHA', 0x01A2),
    123             ('LATIN SMALL LETTER GHA', 0x01A3),
    124             ('KANNADA LETTER LLLA', 0x0CDE),
    125             ('LAO LETTER FO FON', 0x0E9D),
    126             ('LAO LETTER FO FAY', 0x0E9F),
    127             ('LAO LETTER RO', 0x0EA3),
    128             ('LAO LETTER LO', 0x0EA5),
    129             ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
    130             ('YI SYLLABLE ITERATION MARK', 0xA015),
    131             ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
    132             ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
    133         ]
    134         for alias, codepoint in aliases:
    135             self.checkletter(alias, chr(codepoint))
    136             name = unicodedata.name(chr(codepoint))
    137             self.assertNotEqual(name, alias)
    138             self.assertEqual(unicodedata.lookup(alias),
    139                              unicodedata.lookup(name))
    140             with self.assertRaises(KeyError):
    141                 unicodedata.ucd_3_2_0.lookup(alias)
    142 
    143     def test_aliases_names_in_pua_range(self):
    144         # We are storing aliases in the PUA 15, but their names shouldn't leak
    145         for cp in range(0xf0000, 0xf0100):
    146             with self.assertRaises(ValueError) as cm:
    147                 unicodedata.name(chr(cp))
    148             self.assertEqual(str(cm.exception), 'no such name')
    149 
    150     def test_named_sequences_names_in_pua_range(self):
    151         # We are storing named seq in the PUA 15, but their names shouldn't leak
    152         for cp in range(0xf0100, 0xf0fff):
    153             with self.assertRaises(ValueError) as cm:
    154                 unicodedata.name(chr(cp))
    155             self.assertEqual(str(cm.exception), 'no such name')
    156 
    157     def test_named_sequences_sample(self):
    158         # Check a few named sequences.  See #12753.
    159         sequences = [
    160             ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
    161             ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
    162             ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
    163             ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
    164             ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
    165         ]
    166         for seqname, codepoints in sequences:
    167             self.assertEqual(unicodedata.lookup(seqname), codepoints)
    168             with self.assertRaises(SyntaxError):
    169                 self.checkletter(seqname, None)
    170             with self.assertRaises(KeyError):
    171                 unicodedata.ucd_3_2_0.lookup(seqname)
    172 
    173     def test_named_sequences_full(self):
    174         # Check all the named sequences
    175         url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
    176                unicodedata.unidata_version)
    177         try:
    178             testdata = support.open_urlresource(url, encoding="utf-8",
    179                                                 check=check_version)
    180         except (OSError, HTTPException):
    181             self.skipTest("Could not retrieve " + url)
    182         self.addCleanup(testdata.close)
    183         for line in testdata:
    184             line = line.strip()
    185             if not line or line.startswith('#'):
    186                 continue
    187             seqname, codepoints = line.split(';')
    188             codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
    189             self.assertEqual(unicodedata.lookup(seqname), codepoints)
    190             with self.assertRaises(SyntaxError):
    191                 self.checkletter(seqname, None)
    192             with self.assertRaises(KeyError):
    193                 unicodedata.ucd_3_2_0.lookup(seqname)
    194 
    195     def test_errors(self):
    196         self.assertRaises(TypeError, unicodedata.name)
    197         self.assertRaises(TypeError, unicodedata.name, 'xx')
    198         self.assertRaises(TypeError, unicodedata.lookup)
    199         self.assertRaises(KeyError, unicodedata.lookup, 'unknown')
    200 
    201     def test_strict_error_handling(self):
    202         # bogus character name
    203         self.assertRaises(
    204             UnicodeError,
    205             str, b"\\N{blah}", 'unicode-escape', 'strict'
    206         )
    207         # long bogus character name
    208         self.assertRaises(
    209             UnicodeError,
    210             str, bytes("\\N{%s}" % ("x" * 100000), "ascii"), 'unicode-escape', 'strict'
    211         )
    212         # missing closing brace
    213         self.assertRaises(
    214             UnicodeError,
    215             str, b"\\N{SPACE", 'unicode-escape', 'strict'
    216         )
    217         # missing opening brace
    218         self.assertRaises(
    219             UnicodeError,
    220             str, b"\\NSPACE", 'unicode-escape', 'strict'
    221         )
    222 
    223     @support.cpython_only
    224     @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
    225     @support.bigmemtest(size=UINT_MAX + 1, memuse=2 + 1, dry_run=False)
    226     def test_issue16335(self, size):
    227         # very very long bogus character name
    228         x = b'\\N{SPACE' + b'x' * (UINT_MAX + 1) + b'}'
    229         self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
    230         self.assertRaisesRegex(UnicodeError,
    231             'unknown Unicode character name',
    232             x.decode, 'unicode-escape'
    233         )
    234 
    235 
# Run the test cases in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
    238