Home | History | Annotate | Download | only in test
      1 """ Test script for the unicodedata module.
      2 
      3     Written by Marc-Andre Lemburg (mal@lemburg.com).
      4 
      5     (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
      6 
      7 """
      8 
      9 import sys
     10 import unittest
     11 import hashlib
     12 from test.support import script_helper
     13 
# Codec and error handler used to turn the per-character results into bytes
# before hashing in UnicodeMethodsTest.  The hashed range covers all of
# 0x0000-0xFFFF, which includes lone surrogates; 'surrogatepass' lets those
# be encoded as UTF-8 instead of raising UnicodeEncodeError.
encoding = 'utf-8'
errors = 'surrogatepass'
     16 
     17 
     18 ### Run tests
     19 
     20 class UnicodeMethodsTest(unittest.TestCase):
     21 
     22     # update this, if the database changes
     23     expectedchecksum = 'c1fa98674a683aa8a8d8dee0c84494f8d36346e6'
     24 
     25     def test_method_checksum(self):
     26         h = hashlib.sha1()
     27         for i in range(0x10000):
     28             char = chr(i)
     29             data = [
     30                 # Predicates (single char)
     31                 "01"[char.isalnum()],
     32                 "01"[char.isalpha()],
     33                 "01"[char.isdecimal()],
     34                 "01"[char.isdigit()],
     35                 "01"[char.islower()],
     36                 "01"[char.isnumeric()],
     37                 "01"[char.isspace()],
     38                 "01"[char.istitle()],
     39                 "01"[char.isupper()],
     40 
     41                 # Predicates (multiple chars)
     42                 "01"[(char + 'abc').isalnum()],
     43                 "01"[(char + 'abc').isalpha()],
     44                 "01"[(char + '123').isdecimal()],
     45                 "01"[(char + '123').isdigit()],
     46                 "01"[(char + 'abc').islower()],
     47                 "01"[(char + '123').isnumeric()],
     48                 "01"[(char + ' \t').isspace()],
     49                 "01"[(char + 'abc').istitle()],
     50                 "01"[(char + 'ABC').isupper()],
     51 
     52                 # Mappings (single char)
     53                 char.lower(),
     54                 char.upper(),
     55                 char.title(),
     56 
     57                 # Mappings (multiple chars)
     58                 (char + 'abc').lower(),
     59                 (char + 'ABC').upper(),
     60                 (char + 'abc').title(),
     61                 (char + 'ABC').title(),
     62 
     63                 ]
     64             h.update(''.join(data).encode(encoding, errors))
     65         result = h.hexdigest()
     66         self.assertEqual(result, self.expectedchecksum)
     67 
     68 class UnicodeDatabaseTest(unittest.TestCase):
     69 
     70     def setUp(self):
     71         # In case unicodedata is not available, this will raise an ImportError,
     72         # but the other test cases will still be run
     73         import unicodedata
     74         self.db = unicodedata
     75 
     76     def tearDown(self):
     77         del self.db
     78 
     79 class UnicodeFunctionsTest(UnicodeDatabaseTest):
     80 
     81     # Update this if the database changes. Make sure to do a full rebuild
     82     # (e.g. 'make distclean && make') to get the correct checksum.
     83     expectedchecksum = 'f891b1e6430c712531b9bc935a38e22d78ba1bf3'
     84     def test_function_checksum(self):
     85         data = []
     86         h = hashlib.sha1()
     87 
     88         for i in range(0x10000):
     89             char = chr(i)
     90             data = [
     91                 # Properties
     92                 format(self.db.digit(char, -1), '.12g'),
     93                 format(self.db.numeric(char, -1), '.12g'),
     94                 format(self.db.decimal(char, -1), '.12g'),
     95                 self.db.category(char),
     96                 self.db.bidirectional(char),
     97                 self.db.decomposition(char),
     98                 str(self.db.mirrored(char)),
     99                 str(self.db.combining(char)),
    100             ]
    101             h.update(''.join(data).encode("ascii"))
    102         result = h.hexdigest()
    103         self.assertEqual(result, self.expectedchecksum)
    104 
    105     def test_digit(self):
    106         self.assertEqual(self.db.digit('A', None), None)
    107         self.assertEqual(self.db.digit('9'), 9)
    108         self.assertEqual(self.db.digit('\u215b', None), None)
    109         self.assertEqual(self.db.digit('\u2468'), 9)
    110         self.assertEqual(self.db.digit('\U00020000', None), None)
    111         self.assertEqual(self.db.digit('\U0001D7FD'), 7)
    112 
    113         self.assertRaises(TypeError, self.db.digit)
    114         self.assertRaises(TypeError, self.db.digit, 'xx')
    115         self.assertRaises(ValueError, self.db.digit, 'x')
    116 
    117     def test_numeric(self):
    118         self.assertEqual(self.db.numeric('A',None), None)
    119         self.assertEqual(self.db.numeric('9'), 9)
    120         self.assertEqual(self.db.numeric('\u215b'), 0.125)
    121         self.assertEqual(self.db.numeric('\u2468'), 9.0)
    122         self.assertEqual(self.db.numeric('\ua627'), 7.0)
    123         self.assertEqual(self.db.numeric('\U00020000', None), None)
    124         self.assertEqual(self.db.numeric('\U0001012A'), 9000)
    125 
    126         self.assertRaises(TypeError, self.db.numeric)
    127         self.assertRaises(TypeError, self.db.numeric, 'xx')
    128         self.assertRaises(ValueError, self.db.numeric, 'x')
    129 
    130     def test_decimal(self):
    131         self.assertEqual(self.db.decimal('A',None), None)
    132         self.assertEqual(self.db.decimal('9'), 9)
    133         self.assertEqual(self.db.decimal('\u215b', None), None)
    134         self.assertEqual(self.db.decimal('\u2468', None), None)
    135         self.assertEqual(self.db.decimal('\U00020000', None), None)
    136         self.assertEqual(self.db.decimal('\U0001D7FD'), 7)
    137 
    138         self.assertRaises(TypeError, self.db.decimal)
    139         self.assertRaises(TypeError, self.db.decimal, 'xx')
    140         self.assertRaises(ValueError, self.db.decimal, 'x')
    141 
    142     def test_category(self):
    143         self.assertEqual(self.db.category('\uFFFE'), 'Cn')
    144         self.assertEqual(self.db.category('a'), 'Ll')
    145         self.assertEqual(self.db.category('A'), 'Lu')
    146         self.assertEqual(self.db.category('\U00020000'), 'Lo')
    147         self.assertEqual(self.db.category('\U0001012A'), 'No')
    148 
    149         self.assertRaises(TypeError, self.db.category)
    150         self.assertRaises(TypeError, self.db.category, 'xx')
    151 
    152     def test_bidirectional(self):
    153         self.assertEqual(self.db.bidirectional('\uFFFE'), '')
    154         self.assertEqual(self.db.bidirectional(' '), 'WS')
    155         self.assertEqual(self.db.bidirectional('A'), 'L')
    156         self.assertEqual(self.db.bidirectional('\U00020000'), 'L')
    157 
    158         self.assertRaises(TypeError, self.db.bidirectional)
    159         self.assertRaises(TypeError, self.db.bidirectional, 'xx')
    160 
    161     def test_decomposition(self):
    162         self.assertEqual(self.db.decomposition('\uFFFE'),'')
    163         self.assertEqual(self.db.decomposition('\u00bc'), '<fraction> 0031 2044 0034')
    164 
    165         self.assertRaises(TypeError, self.db.decomposition)
    166         self.assertRaises(TypeError, self.db.decomposition, 'xx')
    167 
    168     def test_mirrored(self):
    169         self.assertEqual(self.db.mirrored('\uFFFE'), 0)
    170         self.assertEqual(self.db.mirrored('a'), 0)
    171         self.assertEqual(self.db.mirrored('\u2201'), 1)
    172         self.assertEqual(self.db.mirrored('\U00020000'), 0)
    173 
    174         self.assertRaises(TypeError, self.db.mirrored)
    175         self.assertRaises(TypeError, self.db.mirrored, 'xx')
    176 
    177     def test_combining(self):
    178         self.assertEqual(self.db.combining('\uFFFE'), 0)
    179         self.assertEqual(self.db.combining('a'), 0)
    180         self.assertEqual(self.db.combining('\u20e1'), 230)
    181         self.assertEqual(self.db.combining('\U00020000'), 0)
    182 
    183         self.assertRaises(TypeError, self.db.combining)
    184         self.assertRaises(TypeError, self.db.combining, 'xx')
    185 
    186     def test_normalize(self):
    187         self.assertRaises(TypeError, self.db.normalize)
    188         self.assertRaises(ValueError, self.db.normalize, 'unknown', 'xx')
    189         self.assertEqual(self.db.normalize('NFKC', ''), '')
    190         # The rest can be found in test_normalization.py
    191         # which requires an external file.
    192 
    193     def test_pr29(self):
    194         # http://www.unicode.org/review/pr-29.html
    195         # See issues #1054943 and #10254.
    196         composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161",
    197                     'Li\u030dt-s\u1e73\u0301',
    198                     '\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
    199                     + '\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
    200                     '\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
    201                     + '\u0938\u094d\u0924\u093e\u0928')
    202         for text in composed:
    203             self.assertEqual(self.db.normalize('NFC', text), text)
    204 
    205     def test_issue10254(self):
    206         # Crash reported in #10254
    207         a = 'C\u0338' * 20  + 'C\u0327'
    208         b = 'C\u0338' * 20  + '\xC7'
    209         self.assertEqual(self.db.normalize('NFC', a), b)
    210 
    211     def test_east_asian_width(self):
    212         eaw = self.db.east_asian_width
    213         self.assertRaises(TypeError, eaw, b'a')
    214         self.assertRaises(TypeError, eaw, bytearray())
    215         self.assertRaises(TypeError, eaw, '')
    216         self.assertRaises(TypeError, eaw, 'ra')
    217         self.assertEqual(eaw('\x1e'), 'N')
    218         self.assertEqual(eaw('\x20'), 'Na')
    219         self.assertEqual(eaw('\uC894'), 'W')
    220         self.assertEqual(eaw('\uFF66'), 'H')
    221         self.assertEqual(eaw('\uFF1F'), 'F')
    222         self.assertEqual(eaw('\u2010'), 'A')
    223         self.assertEqual(eaw('\U00020000'), 'W')
    224 
    225     def test_east_asian_width_9_0_changes(self):
    226         self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
    227         self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
    228 
    229 class UnicodeMiscTest(UnicodeDatabaseTest):
    230 
    231     def test_failed_import_during_compiling(self):
    232         # Issue 4367
    233         # Decoding \N escapes requires the unicodedata module. If it can't be
    234         # imported, we shouldn't segfault.
    235 
    236         # This program should raise a SyntaxError in the eval.
    237         code = "import sys;" \
    238             "sys.modules['unicodedata'] = None;" \
    239             """eval("'\\\\N{SOFT HYPHEN}'")"""
    240         # We use a separate process because the unicodedata module may already
    241         # have been loaded in this process.
    242         result = script_helper.assert_python_failure("-c", code)
    243         error = "SyntaxError: (unicode error) \\N escapes not supported " \
    244             "(can't load unicodedata module)"
    245         self.assertIn(error, result.err.decode("ascii"))
    246 
    247     def test_decimal_numeric_consistent(self):
    248         # Test that decimal and numeric are consistent,
    249         # i.e. if a character has a decimal value,
    250         # its numeric value should be the same.
    251         count = 0
    252         for i in range(0x10000):
    253             c = chr(i)
    254             dec = self.db.decimal(c, -1)
    255             if dec != -1:
    256                 self.assertEqual(dec, self.db.numeric(c))
    257                 count += 1
    258         self.assertTrue(count >= 10) # should have tested at least the ASCII digits
    259 
    260     def test_digit_numeric_consistent(self):
    261         # Test that digit and numeric are consistent,
    262         # i.e. if a character has a digit value,
    263         # its numeric value should be the same.
    264         count = 0
    265         for i in range(0x10000):
    266             c = chr(i)
    267             dec = self.db.digit(c, -1)
    268             if dec != -1:
    269                 self.assertEqual(dec, self.db.numeric(c))
    270                 count += 1
    271         self.assertTrue(count >= 10) # should have tested at least the ASCII digits
    272 
    273     def test_bug_1704793(self):
    274         self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346')
    275 
    276     def test_ucd_510(self):
    277         import unicodedata
    278         # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
    279         self.assertTrue(unicodedata.mirrored("\u0f3a"))
    280         self.assertTrue(not unicodedata.ucd_3_2_0.mirrored("\u0f3a"))
    281         # Also, we now have two ways of representing
    282         # the upper-case mapping: as delta, or as absolute value
    283         self.assertTrue("a".upper()=='A')
    284         self.assertTrue("\u1d79".upper()=='\ua77d')
    285         self.assertTrue(".".upper()=='.')
    286 
    287     def test_bug_5828(self):
    288         self.assertEqual("\u1d79".lower(), "\u1d79")
    289         # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
    290         self.assertEqual(
    291             [
    292                 c for c in range(sys.maxunicode+1)
    293                 if "\x00" in chr(c).lower()+chr(c).upper()+chr(c).title()
    294             ],
    295             [0]
    296         )
    297 
    298     def test_bug_4971(self):
    299         # LETTER DZ WITH CARON: DZ, Dz, dz
    300         self.assertEqual("\u01c4".title(), "\u01c5")
    301         self.assertEqual("\u01c5".title(), "\u01c5")
    302         self.assertEqual("\u01c6".title(), "\u01c5")
    303 
    304     def test_linebreak_7643(self):
    305         for i in range(0x10000):
    306             lines = (chr(i) + 'A').splitlines()
    307             if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
    308                      0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
    309                 self.assertEqual(len(lines), 2,
    310                                  r"\u%.4x should be a linebreak" % i)
    311             else:
    312                 self.assertEqual(len(lines), 1,
    313                                  r"\u%.4x should not be a linebreak" % i)
    314 
# Run the tests in this module when the file is executed directly as a
# script rather than imported.
if __name__ == "__main__":
    unittest.main()
    317