Home | History | Annotate | Download | only in test
      1 """ Test script for the unicodedata module.
      2 
    Written by Marc-Andre Lemburg (mal@lemburg.com).
      4 
      5     (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
      6 
      7 """
      8 
      9 import sys
     10 import unittest
     11 import hashlib
     12 import subprocess
     13 import test.test_support
     14 
# Codec used to turn the joined per-character strings into bytes before they
# are fed to the SHA-1 hash in UnicodeMethodsTest.test_method_checksum.
encoding = 'utf-8'
     16 
     17 
     18 ### Run tests
     19 
     20 class UnicodeMethodsTest(unittest.TestCase):
     21 
     22     # update this, if the database changes
     23     expectedchecksum = '4504dffd035baea02c5b9de82bebc3d65e0e0baf'
     24 
     25     def test_method_checksum(self):
     26         h = hashlib.sha1()
     27         for i in range(0x10000):
     28             char = unichr(i)
     29             data = [
     30                 # Predicates (single char)
     31                 u"01"[char.isalnum()],
     32                 u"01"[char.isalpha()],
     33                 u"01"[char.isdecimal()],
     34                 u"01"[char.isdigit()],
     35                 u"01"[char.islower()],
     36                 u"01"[char.isnumeric()],
     37                 u"01"[char.isspace()],
     38                 u"01"[char.istitle()],
     39                 u"01"[char.isupper()],
     40 
     41                 # Predicates (multiple chars)
     42                 u"01"[(char + u'abc').isalnum()],
     43                 u"01"[(char + u'abc').isalpha()],
     44                 u"01"[(char + u'123').isdecimal()],
     45                 u"01"[(char + u'123').isdigit()],
     46                 u"01"[(char + u'abc').islower()],
     47                 u"01"[(char + u'123').isnumeric()],
     48                 u"01"[(char + u' \t').isspace()],
     49                 u"01"[(char + u'abc').istitle()],
     50                 u"01"[(char + u'ABC').isupper()],
     51 
     52                 # Mappings (single char)
     53                 char.lower(),
     54                 char.upper(),
     55                 char.title(),
     56 
     57                 # Mappings (multiple chars)
     58                 (char + u'abc').lower(),
     59                 (char + u'ABC').upper(),
     60                 (char + u'abc').title(),
     61                 (char + u'ABC').title(),
     62 
     63                 ]
     64             h.update(u''.join(data).encode(encoding))
     65         result = h.hexdigest()
     66         self.assertEqual(result, self.expectedchecksum)
     67 
     68 class UnicodeDatabaseTest(unittest.TestCase):
     69 
     70     def setUp(self):
     71         # In case unicodedata is not available, this will raise an ImportError,
     72         # but the other test cases will still be run
     73         import unicodedata
     74         self.db = unicodedata
     75 
     76     def tearDown(self):
     77         del self.db
     78 
     79 class UnicodeFunctionsTest(UnicodeDatabaseTest):
     80 
     81     # update this, if the database changes
     82     expectedchecksum = '6ccf1b1a36460d2694f9b0b0f0324942fe70ede6'
     83 
     84     def test_function_checksum(self):
     85         data = []
     86         h = hashlib.sha1()
     87 
     88         for i in range(0x10000):
     89             char = unichr(i)
     90             data = [
     91                 # Properties
     92                 str(self.db.digit(char, -1)),
     93                 str(self.db.numeric(char, -1)),
     94                 str(self.db.decimal(char, -1)),
     95                 self.db.category(char),
     96                 self.db.bidirectional(char),
     97                 self.db.decomposition(char),
     98                 str(self.db.mirrored(char)),
     99                 str(self.db.combining(char)),
    100             ]
    101             h.update(''.join(data))
    102         result = h.hexdigest()
    103         self.assertEqual(result, self.expectedchecksum)
    104 
    105     def test_digit(self):
    106         self.assertEqual(self.db.digit(u'A', None), None)
    107         self.assertEqual(self.db.digit(u'9'), 9)
    108         self.assertEqual(self.db.digit(u'\u215b', None), None)
    109         self.assertEqual(self.db.digit(u'\u2468'), 9)
    110         self.assertEqual(self.db.digit(u'\U00020000', None), None)
    111 
    112         self.assertRaises(TypeError, self.db.digit)
    113         self.assertRaises(TypeError, self.db.digit, u'xx')
    114         self.assertRaises(ValueError, self.db.digit, u'x')
    115 
    116     def test_numeric(self):
    117         self.assertEqual(self.db.numeric(u'A',None), None)
    118         self.assertEqual(self.db.numeric(u'9'), 9)
    119         self.assertEqual(self.db.numeric(u'\u215b'), 0.125)
    120         self.assertEqual(self.db.numeric(u'\u2468'), 9.0)
    121         self.assertEqual(self.db.numeric(u'\ua627'), 7.0)
    122         self.assertEqual(self.db.numeric(u'\U00020000', None), None)
    123 
    124         self.assertRaises(TypeError, self.db.numeric)
    125         self.assertRaises(TypeError, self.db.numeric, u'xx')
    126         self.assertRaises(ValueError, self.db.numeric, u'x')
    127 
    128     def test_decimal(self):
    129         self.assertEqual(self.db.decimal(u'A',None), None)
    130         self.assertEqual(self.db.decimal(u'9'), 9)
    131         self.assertEqual(self.db.decimal(u'\u215b', None), None)
    132         self.assertEqual(self.db.decimal(u'\u2468', None), None)
    133         self.assertEqual(self.db.decimal(u'\U00020000', None), None)
    134 
    135         self.assertRaises(TypeError, self.db.decimal)
    136         self.assertRaises(TypeError, self.db.decimal, u'xx')
    137         self.assertRaises(ValueError, self.db.decimal, u'x')
    138 
    139     def test_category(self):
    140         self.assertEqual(self.db.category(u'\uFFFE'), 'Cn')
    141         self.assertEqual(self.db.category(u'a'), 'Ll')
    142         self.assertEqual(self.db.category(u'A'), 'Lu')
    143         self.assertEqual(self.db.category(u'\U00020000'), 'Lo')
    144 
    145         self.assertRaises(TypeError, self.db.category)
    146         self.assertRaises(TypeError, self.db.category, u'xx')
    147 
    148     def test_bidirectional(self):
    149         self.assertEqual(self.db.bidirectional(u'\uFFFE'), '')
    150         self.assertEqual(self.db.bidirectional(u' '), 'WS')
    151         self.assertEqual(self.db.bidirectional(u'A'), 'L')
    152         self.assertEqual(self.db.bidirectional(u'\U00020000'), 'L')
    153 
    154         self.assertRaises(TypeError, self.db.bidirectional)
    155         self.assertRaises(TypeError, self.db.bidirectional, u'xx')
    156 
    157     def test_decomposition(self):
    158         self.assertEqual(self.db.decomposition(u'\uFFFE'),'')
    159         self.assertEqual(self.db.decomposition(u'\u00bc'), '<fraction> 0031 2044 0034')
    160 
    161         self.assertRaises(TypeError, self.db.decomposition)
    162         self.assertRaises(TypeError, self.db.decomposition, u'xx')
    163 
    164     def test_mirrored(self):
    165         self.assertEqual(self.db.mirrored(u'\uFFFE'), 0)
    166         self.assertEqual(self.db.mirrored(u'a'), 0)
    167         self.assertEqual(self.db.mirrored(u'\u2201'), 1)
    168         self.assertEqual(self.db.mirrored(u'\U00020000'), 0)
    169 
    170         self.assertRaises(TypeError, self.db.mirrored)
    171         self.assertRaises(TypeError, self.db.mirrored, u'xx')
    172 
    173     def test_combining(self):
    174         self.assertEqual(self.db.combining(u'\uFFFE'), 0)
    175         self.assertEqual(self.db.combining(u'a'), 0)
    176         self.assertEqual(self.db.combining(u'\u20e1'), 230)
    177         self.assertEqual(self.db.combining(u'\U00020000'), 0)
    178 
    179         self.assertRaises(TypeError, self.db.combining)
    180         self.assertRaises(TypeError, self.db.combining, u'xx')
    181 
    182     def test_normalize(self):
    183         self.assertRaises(TypeError, self.db.normalize)
    184         self.assertRaises(ValueError, self.db.normalize, 'unknown', u'xx')
    185         self.assertEqual(self.db.normalize('NFKC', u''), u'')
    186         # The rest can be found in test_normalization.py
    187         # which requires an external file.
    188 
    189     def test_pr29(self):
    190         # http://www.unicode.org/review/pr-29.html
    191         # See issues #1054943 and #10254.
    192         composed = (u"\u0b47\u0300\u0b3e", u"\u1100\u0300\u1161",
    193                     u'Li\u030dt-s\u1e73\u0301',
    194                     u'\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
    195                     + u'\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
    196                     u'\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
    197                     + 'u\u0938\u094d\u0924\u093e\u0928')
    198         for text in composed:
    199             self.assertEqual(self.db.normalize('NFC', text), text)
    200 
    201     def test_issue10254(self):
    202         # Crash reported in #10254
    203         a = u'C\u0338' * 20  + u'C\u0327'
    204         b = u'C\u0338' * 20  + u'\xC7'
    205         self.assertEqual(self.db.normalize('NFC', a), b)
    206 
    207     def test_issue29456(self):
    208         # Fix #29456
    209         u1176_str_a = u'\u1100\u1176\u11a8'
    210         u1176_str_b = u'\u1100\u1176\u11a8'
    211         u11a7_str_a = u'\u1100\u1175\u11a7'
    212         u11a7_str_b = u'\uae30\u11a7'
    213         u11c3_str_a = u'\u1100\u1175\u11c3'
    214         u11c3_str_b = u'\uae30\u11c3'
    215         self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
    216         self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
    217         self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
    218 
    219 
    220     def test_east_asian_width(self):
    221         eaw = self.db.east_asian_width
    222         self.assertRaises(TypeError, eaw, 'a')
    223         self.assertRaises(TypeError, eaw, u'')
    224         self.assertRaises(TypeError, eaw, u'ra')
    225         self.assertEqual(eaw(u'\x1e'), 'N')
    226         self.assertEqual(eaw(u'\x20'), 'Na')
    227         self.assertEqual(eaw(u'\uC894'), 'W')
    228         self.assertEqual(eaw(u'\uFF66'), 'H')
    229         self.assertEqual(eaw(u'\uFF1F'), 'F')
    230         self.assertEqual(eaw(u'\u2010'), 'A')
    231         self.assertEqual(eaw(u'\U00020000'), 'W')
    232 
    233 class UnicodeMiscTest(UnicodeDatabaseTest):
    234 
    235     def test_failed_import_during_compiling(self):
    236         # Issue 4367
    237         # Decoding \N escapes requires the unicodedata module. If it can't be
    238         # imported, we shouldn't segfault.
    239 
    240         # This program should raise a SyntaxError in the eval.
    241         code = "import sys;" \
    242             "sys.modules['unicodedata'] = None;" \
    243             """eval("u'\N{SOFT HYPHEN}'")"""
    244         args = [sys.executable, "-c", code]
    245         # We use a subprocess because the unicodedata module may already have
    246         # been loaded in this process.
    247         popen = subprocess.Popen(args, stderr=subprocess.PIPE)
    248         popen.wait()
    249         self.assertEqual(popen.returncode, 1)
    250         error = "SyntaxError: (unicode error) \N escapes not supported " \
    251             "(can't load unicodedata module)"
    252         self.assertIn(error, popen.stderr.read())
    253 
    254     def test_decimal_numeric_consistent(self):
    255         # Test that decimal and numeric are consistent,
    256         # i.e. if a character has a decimal value,
    257         # its numeric value should be the same.
    258         count = 0
    259         for i in xrange(0x10000):
    260             c = unichr(i)
    261             dec = self.db.decimal(c, -1)
    262             if dec != -1:
    263                 self.assertEqual(dec, self.db.numeric(c))
    264                 count += 1
    265         self.assertTrue(count >= 10) # should have tested at least the ASCII digits
    266 
    267     def test_digit_numeric_consistent(self):
    268         # Test that digit and numeric are consistent,
    269         # i.e. if a character has a digit value,
    270         # its numeric value should be the same.
    271         count = 0
    272         for i in xrange(0x10000):
    273             c = unichr(i)
    274             dec = self.db.digit(c, -1)
    275             if dec != -1:
    276                 self.assertEqual(dec, self.db.numeric(c))
    277                 count += 1
    278         self.assertTrue(count >= 10) # should have tested at least the ASCII digits
    279 
    280     def test_bug_1704793(self):
    281         self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), u'\U00010346')
    282 
    283     def test_ucd_510(self):
    284         import unicodedata
    285         # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
    286         self.assertTrue(unicodedata.mirrored(u"\u0f3a"))
    287         self.assertTrue(not unicodedata.ucd_3_2_0.mirrored(u"\u0f3a"))
    288         # Also, we now have two ways of representing
    289         # the upper-case mapping: as delta, or as absolute value
    290         self.assertTrue(u"a".upper()==u'A')
    291         self.assertTrue(u"\u1d79".upper()==u'\ua77d')
    292         self.assertTrue(u".".upper()==u".")
    293 
    294     def test_bug_5828(self):
    295         self.assertEqual(u"\u1d79".lower(), u"\u1d79")
    296         # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
    297         self.assertEqual(
    298             [
    299                 c for c in range(sys.maxunicode+1)
    300                 if u"\x00" in unichr(c).lower()+unichr(c).upper()+unichr(c).title()
    301             ],
    302             [0]
    303         )
    304 
    305     def test_bug_4971(self):
    306         # LETTER DZ WITH CARON: DZ, Dz, dz
    307         self.assertEqual(u"\u01c4".title(), u"\u01c5")
    308         self.assertEqual(u"\u01c5".title(), u"\u01c5")
    309         self.assertEqual(u"\u01c6".title(), u"\u01c5")
    310 
    311     def test_linebreak_7643(self):
    312         for i in range(0x10000):
    313             lines = (unichr(i) + u'A').splitlines()
    314             if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
    315                      0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
    316                 self.assertEqual(len(lines), 2,
    317                                  r"\u%.4x should be a linebreak" % i)
    318             else:
    319                 self.assertEqual(len(lines), 1,
    320                                  r"\u%.4x should not be a linebreak" % i)
    321 
    322 def test_main():
    323     test.test_support.run_unittest(
    324         UnicodeMiscTest,
    325         UnicodeMethodsTest,
    326         UnicodeFunctionsTest
    327     )
    328 
# When this file is executed directly as a script, run the suite via
# test_main().
if __name__ == "__main__":
    test_main()
    331