Home | History | Annotate | Download | only in test
      1 """ Test script for the unicodedata module.
      2 
      3     Written by Marc-Andre Lemburg (mal@lemburg.com).
      4 
      5     (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
      6 
      7 """
      8 
      9 import sys
     10 import unittest
     11 import hashlib
     12 import subprocess
     13 import test.test_support
     14 
     15 encoding = 'utf-8'
     16 
     17 
     18 ### Run tests
     19 
     20 class UnicodeMethodsTest(unittest.TestCase):
     21 
     22     # update this, if the database changes
     23     expectedchecksum = '4504dffd035baea02c5b9de82bebc3d65e0e0baf'
     24 
     25     def test_method_checksum(self):
     26         h = hashlib.sha1()
     27         for i in range(0x10000):
     28             char = unichr(i)
     29             data = [
     30                 # Predicates (single char)
     31                 u"01"[char.isalnum()],
     32                 u"01"[char.isalpha()],
     33                 u"01"[char.isdecimal()],
     34                 u"01"[char.isdigit()],
     35                 u"01"[char.islower()],
     36                 u"01"[char.isnumeric()],
     37                 u"01"[char.isspace()],
     38                 u"01"[char.istitle()],
     39                 u"01"[char.isupper()],
     40 
     41                 # Predicates (multiple chars)
     42                 u"01"[(char + u'abc').isalnum()],
     43                 u"01"[(char + u'abc').isalpha()],
     44                 u"01"[(char + u'123').isdecimal()],
     45                 u"01"[(char + u'123').isdigit()],
     46                 u"01"[(char + u'abc').islower()],
     47                 u"01"[(char + u'123').isnumeric()],
     48                 u"01"[(char + u' \t').isspace()],
     49                 u"01"[(char + u'abc').istitle()],
     50                 u"01"[(char + u'ABC').isupper()],
     51 
     52                 # Mappings (single char)
     53                 char.lower(),
     54                 char.upper(),
     55                 char.title(),
     56 
     57                 # Mappings (multiple chars)
     58                 (char + u'abc').lower(),
     59                 (char + u'ABC').upper(),
     60                 (char + u'abc').title(),
     61                 (char + u'ABC').title(),
     62 
     63                 ]
     64             h.update(u''.join(data).encode(encoding))
     65         result = h.hexdigest()
     66         self.assertEqual(result, self.expectedchecksum)
     67 
     68 class UnicodeDatabaseTest(unittest.TestCase):
     69 
     70     def setUp(self):
     71         # In case unicodedata is not available, this will raise an ImportError,
     72         # but the other test cases will still be run
     73         import unicodedata
     74         self.db = unicodedata
     75 
     76     def tearDown(self):
     77         del self.db
     78 
     79 class UnicodeFunctionsTest(UnicodeDatabaseTest):
     80 
     81     # update this, if the database changes
     82     expectedchecksum = '6ccf1b1a36460d2694f9b0b0f0324942fe70ede6'
     83 
     84     def test_function_checksum(self):
     85         data = []
     86         h = hashlib.sha1()
     87 
     88         for i in range(0x10000):
     89             char = unichr(i)
     90             data = [
     91                 # Properties
     92                 str(self.db.digit(char, -1)),
     93                 str(self.db.numeric(char, -1)),
     94                 str(self.db.decimal(char, -1)),
     95                 self.db.category(char),
     96                 self.db.bidirectional(char),
     97                 self.db.decomposition(char),
     98                 str(self.db.mirrored(char)),
     99                 str(self.db.combining(char)),
    100             ]
    101             h.update(''.join(data))
    102         result = h.hexdigest()
    103         self.assertEqual(result, self.expectedchecksum)
    104 
    105     def test_digit(self):
    106         self.assertEqual(self.db.digit(u'A', None), None)
    107         self.assertEqual(self.db.digit(u'9'), 9)
    108         self.assertEqual(self.db.digit(u'\u215b', None), None)
    109         self.assertEqual(self.db.digit(u'\u2468'), 9)
    110         self.assertEqual(self.db.digit(u'\U00020000', None), None)
    111 
    112         self.assertRaises(TypeError, self.db.digit)
    113         self.assertRaises(TypeError, self.db.digit, u'xx')
    114         self.assertRaises(ValueError, self.db.digit, u'x')
    115 
    116     def test_numeric(self):
    117         self.assertEqual(self.db.numeric(u'A',None), None)
    118         self.assertEqual(self.db.numeric(u'9'), 9)
    119         self.assertEqual(self.db.numeric(u'\u215b'), 0.125)
    120         self.assertEqual(self.db.numeric(u'\u2468'), 9.0)
    121         self.assertEqual(self.db.numeric(u'\ua627'), 7.0)
    122         self.assertEqual(self.db.numeric(u'\U00020000', None), None)
    123 
    124         self.assertRaises(TypeError, self.db.numeric)
    125         self.assertRaises(TypeError, self.db.numeric, u'xx')
    126         self.assertRaises(ValueError, self.db.numeric, u'x')
    127 
    128     def test_decimal(self):
    129         self.assertEqual(self.db.decimal(u'A',None), None)
    130         self.assertEqual(self.db.decimal(u'9'), 9)
    131         self.assertEqual(self.db.decimal(u'\u215b', None), None)
    132         self.assertEqual(self.db.decimal(u'\u2468', None), None)
    133         self.assertEqual(self.db.decimal(u'\U00020000', None), None)
    134 
    135         self.assertRaises(TypeError, self.db.decimal)
    136         self.assertRaises(TypeError, self.db.decimal, u'xx')
    137         self.assertRaises(ValueError, self.db.decimal, u'x')
    138 
    139     def test_category(self):
    140         self.assertEqual(self.db.category(u'\uFFFE'), 'Cn')
    141         self.assertEqual(self.db.category(u'a'), 'Ll')
    142         self.assertEqual(self.db.category(u'A'), 'Lu')
    143         self.assertEqual(self.db.category(u'\U00020000'), 'Lo')
    144 
    145         self.assertRaises(TypeError, self.db.category)
    146         self.assertRaises(TypeError, self.db.category, u'xx')
    147 
    148     def test_bidirectional(self):
    149         self.assertEqual(self.db.bidirectional(u'\uFFFE'), '')
    150         self.assertEqual(self.db.bidirectional(u' '), 'WS')
    151         self.assertEqual(self.db.bidirectional(u'A'), 'L')
    152         self.assertEqual(self.db.bidirectional(u'\U00020000'), 'L')
    153 
    154         self.assertRaises(TypeError, self.db.bidirectional)
    155         self.assertRaises(TypeError, self.db.bidirectional, u'xx')
    156 
    157     def test_decomposition(self):
    158         self.assertEqual(self.db.decomposition(u'\uFFFE'),'')
    159         self.assertEqual(self.db.decomposition(u'\u00bc'), '<fraction> 0031 2044 0034')
    160 
    161         self.assertRaises(TypeError, self.db.decomposition)
    162         self.assertRaises(TypeError, self.db.decomposition, u'xx')
    163 
    164     def test_mirrored(self):
    165         self.assertEqual(self.db.mirrored(u'\uFFFE'), 0)
    166         self.assertEqual(self.db.mirrored(u'a'), 0)
    167         self.assertEqual(self.db.mirrored(u'\u2201'), 1)
    168         self.assertEqual(self.db.mirrored(u'\U00020000'), 0)
    169 
    170         self.assertRaises(TypeError, self.db.mirrored)
    171         self.assertRaises(TypeError, self.db.mirrored, u'xx')
    172 
    173     def test_combining(self):
    174         self.assertEqual(self.db.combining(u'\uFFFE'), 0)
    175         self.assertEqual(self.db.combining(u'a'), 0)
    176         self.assertEqual(self.db.combining(u'\u20e1'), 230)
    177         self.assertEqual(self.db.combining(u'\U00020000'), 0)
    178 
    179         self.assertRaises(TypeError, self.db.combining)
    180         self.assertRaises(TypeError, self.db.combining, u'xx')
    181 
    182     def test_normalize(self):
    183         self.assertRaises(TypeError, self.db.normalize)
    184         self.assertRaises(ValueError, self.db.normalize, 'unknown', u'xx')
    185         self.assertEqual(self.db.normalize('NFKC', u''), u'')
    186         # The rest can be found in test_normalization.py
    187         # which requires an external file.
    188 
    189     def test_pr29(self):
    190         # http://www.unicode.org/review/pr-29.html
    191         # See issues #1054943 and #10254.
    192         composed = (u"\u0b47\u0300\u0b3e", u"\u1100\u0300\u1161",
    193                     u'Li\u030dt-s\u1e73\u0301',
    194                     u'\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
    195                     + u'\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
    196                     u'\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
    197                     + 'u\u0938\u094d\u0924\u093e\u0928')
    198         for text in composed:
    199             self.assertEqual(self.db.normalize('NFC', text), text)
    200 
    201     def test_issue10254(self):
    202         # Crash reported in #10254
    203         a = u'C\u0338' * 20  + u'C\u0327'
    204         b = u'C\u0338' * 20  + u'\xC7'
    205         self.assertEqual(self.db.normalize('NFC', a), b)
    206 
    207     def test_east_asian_width(self):
    208         eaw = self.db.east_asian_width
    209         self.assertRaises(TypeError, eaw, 'a')
    210         self.assertRaises(TypeError, eaw, u'')
    211         self.assertRaises(TypeError, eaw, u'ra')
    212         self.assertEqual(eaw(u'\x1e'), 'N')
    213         self.assertEqual(eaw(u'\x20'), 'Na')
    214         self.assertEqual(eaw(u'\uC894'), 'W')
    215         self.assertEqual(eaw(u'\uFF66'), 'H')
    216         self.assertEqual(eaw(u'\uFF1F'), 'F')
    217         self.assertEqual(eaw(u'\u2010'), 'A')
    218         self.assertEqual(eaw(u'\U00020000'), 'W')
    219 
    220 class UnicodeMiscTest(UnicodeDatabaseTest):
    221 
    222     def test_failed_import_during_compiling(self):
    223         # Issue 4367
    224         # Decoding \N escapes requires the unicodedata module. If it can't be
    225         # imported, we shouldn't segfault.
    226 
    227         # This program should raise a SyntaxError in the eval.
    228         code = "import sys;" \
    229             "sys.modules['unicodedata'] = None;" \
    230             """eval("u'\N{SOFT HYPHEN}'")"""
    231         args = [sys.executable, "-c", code]
    232         # We use a subprocess because the unicodedata module may already have
    233         # been loaded in this process.
    234         popen = subprocess.Popen(args, stderr=subprocess.PIPE)
    235         popen.wait()
    236         self.assertEqual(popen.returncode, 1)
    237         error = "SyntaxError: (unicode error) \N escapes not supported " \
    238             "(can't load unicodedata module)"
    239         self.assertIn(error, popen.stderr.read())
    240 
    241     def test_decimal_numeric_consistent(self):
    242         # Test that decimal and numeric are consistent,
    243         # i.e. if a character has a decimal value,
    244         # its numeric value should be the same.
    245         count = 0
    246         for i in xrange(0x10000):
    247             c = unichr(i)
    248             dec = self.db.decimal(c, -1)
    249             if dec != -1:
    250                 self.assertEqual(dec, self.db.numeric(c))
    251                 count += 1
    252         self.assertTrue(count >= 10) # should have tested at least the ASCII digits
    253 
    254     def test_digit_numeric_consistent(self):
    255         # Test that digit and numeric are consistent,
    256         # i.e. if a character has a digit value,
    257         # its numeric value should be the same.
    258         count = 0
    259         for i in xrange(0x10000):
    260             c = unichr(i)
    261             dec = self.db.digit(c, -1)
    262             if dec != -1:
    263                 self.assertEqual(dec, self.db.numeric(c))
    264                 count += 1
    265         self.assertTrue(count >= 10) # should have tested at least the ASCII digits
    266 
    267     def test_bug_1704793(self):
    268         self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), u'\U00010346')
    269 
    270     def test_ucd_510(self):
    271         import unicodedata
    272         # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
    273         self.assertTrue(unicodedata.mirrored(u"\u0f3a"))
    274         self.assertTrue(not unicodedata.ucd_3_2_0.mirrored(u"\u0f3a"))
    275         # Also, we now have two ways of representing
    276         # the upper-case mapping: as delta, or as absolute value
    277         self.assertTrue(u"a".upper()==u'A')
    278         self.assertTrue(u"\u1d79".upper()==u'\ua77d')
    279         self.assertTrue(u".".upper()==u".")
    280 
    281     def test_bug_5828(self):
    282         self.assertEqual(u"\u1d79".lower(), u"\u1d79")
    283         # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
    284         self.assertEqual(
    285             [
    286                 c for c in range(sys.maxunicode+1)
    287                 if u"\x00" in unichr(c).lower()+unichr(c).upper()+unichr(c).title()
    288             ],
    289             [0]
    290         )
    291 
    292     def test_bug_4971(self):
    293         # LETTER DZ WITH CARON: DZ, Dz, dz
    294         self.assertEqual(u"\u01c4".title(), u"\u01c5")
    295         self.assertEqual(u"\u01c5".title(), u"\u01c5")
    296         self.assertEqual(u"\u01c6".title(), u"\u01c5")
    297 
    298     def test_linebreak_7643(self):
    299         for i in range(0x10000):
    300             lines = (unichr(i) + u'A').splitlines()
    301             if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
    302                      0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
    303                 self.assertEqual(len(lines), 2,
    304                                  r"\u%.4x should be a linebreak" % i)
    305             else:
    306                 self.assertEqual(len(lines), 1,
    307                                  r"\u%.4x should not be a linebreak" % i)
    308 
    309 def test_main():
    310     test.test_support.run_unittest(
    311         UnicodeMiscTest,
    312         UnicodeMethodsTest,
    313         UnicodeFunctionsTest
    314     )
    315 
    316 if __name__ == "__main__":
    317     test_main()
    318