Home | History | Annotate | Download | only in test
      1 # test_multibytecodec.py
      2 #   Unit test for multibytecodec itself
      3 #
      4 
      5 from test import test_support
      6 from test.test_support import TESTFN
      7 import unittest, StringIO, codecs, sys, os
      8 import _multibytecodec
      9 
     10 ALL_CJKENCODINGS = [
     11 # _codecs_cn
     12     'gb2312', 'gbk', 'gb18030', 'hz',
     13 # _codecs_hk
     14     'big5hkscs',
     15 # _codecs_jp
     16     'cp932', 'shift_jis', 'euc_jp', 'euc_jisx0213', 'shift_jisx0213',
     17     'euc_jis_2004', 'shift_jis_2004',
     18 # _codecs_kr
     19     'cp949', 'euc_kr', 'johab',
     20 # _codecs_tw
     21     'big5', 'cp950',
     22 # _codecs_iso2022
     23     'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004',
     24     'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr',
     25 ]
     26 
     27 class Test_MultibyteCodec(unittest.TestCase):
     28 
     29     def test_nullcoding(self):
     30         for enc in ALL_CJKENCODINGS:
     31             self.assertEqual(''.decode(enc), u'')
     32             self.assertEqual(unicode('', enc), u'')
     33             self.assertEqual(u''.encode(enc), '')
     34 
     35     def test_str_decode(self):
     36         for enc in ALL_CJKENCODINGS:
     37             self.assertEqual('abcd'.encode(enc), 'abcd')
     38 
     39     def test_errorcallback_longindex(self):
     40         dec = codecs.getdecoder('euc-kr')
     41         myreplace  = lambda exc: (u'', sys.maxint+1)
     42         codecs.register_error('test.cjktest', myreplace)
     43         self.assertRaises(IndexError, dec,
     44                           'apple\x92ham\x93spam', 'test.cjktest')
     45 
     46     def test_errorcallback_custom_ignore(self):
     47         # Issue #23215: MemoryError with custom error handlers and multibyte codecs
     48         data = 100 * unichr(0xdc00)
     49         codecs.register_error("test.ignore", codecs.ignore_errors)
     50         for enc in ALL_CJKENCODINGS:
     51             self.assertEqual(data.encode(enc, "test.ignore"), b'')
     52 
     53     def test_codingspec(self):
     54         for enc in ALL_CJKENCODINGS:
     55             code = '# coding: {}\n'.format(enc)
     56             exec code
     57 
     58     def test_init_segfault(self):
     59         # bug #3305: this used to segfault
     60         self.assertRaises(AttributeError,
     61                           _multibytecodec.MultibyteStreamReader, None)
     62         self.assertRaises(AttributeError,
     63                           _multibytecodec.MultibyteStreamWriter, None)
     64 
     65 
     66 class Test_IncrementalEncoder(unittest.TestCase):
     67 
     68     def test_stateless(self):
     69         # cp949 encoder isn't stateful at all.
     70         encoder = codecs.getincrementalencoder('cp949')()
     71         self.assertEqual(encoder.encode(u'\ud30c\uc774\uc36c \ub9c8\uc744'),
     72                          '\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
     73         self.assertEqual(encoder.reset(), None)
     74         self.assertEqual(encoder.encode(u'\u2606\u223c\u2606', True),
     75                          '\xa1\xd9\xa1\xad\xa1\xd9')
     76         self.assertEqual(encoder.reset(), None)
     77         self.assertEqual(encoder.encode(u'', True), '')
     78         self.assertEqual(encoder.encode(u'', False), '')
     79         self.assertEqual(encoder.reset(), None)
     80 
     81     def test_stateful(self):
     82         # jisx0213 encoder is stateful for a few code points. eg)
     83         #   U+00E6 => A9DC
     84         #   U+00E6 U+0300 => ABC4
     85         #   U+0300 => ABDC
     86 
     87         encoder = codecs.getincrementalencoder('jisx0213')()
     88         self.assertEqual(encoder.encode(u'\u00e6\u0300'), '\xab\xc4')
     89         self.assertEqual(encoder.encode(u'\u00e6'), '')
     90         self.assertEqual(encoder.encode(u'\u0300'), '\xab\xc4')
     91         self.assertEqual(encoder.encode(u'\u00e6', True), '\xa9\xdc')
     92 
     93         self.assertEqual(encoder.reset(), None)
     94         self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
     95 
     96         self.assertEqual(encoder.encode(u'\u00e6'), '')
     97         self.assertEqual(encoder.encode('', True), '\xa9\xdc')
     98         self.assertEqual(encoder.encode('', True), '')
     99 
    100     def test_stateful_keep_buffer(self):
    101         encoder = codecs.getincrementalencoder('jisx0213')()
    102         self.assertEqual(encoder.encode(u'\u00e6'), '')
    103         self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
    104         self.assertEqual(encoder.encode(u'\u0300\u00e6'), '\xab\xc4')
    105         self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
    106         self.assertEqual(encoder.reset(), None)
    107         self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
    108         self.assertEqual(encoder.encode(u'\u00e6'), '')
    109         self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
    110         self.assertEqual(encoder.encode(u'', True), '\xa9\xdc')
    111 
    112     def test_issue5640(self):
    113         encoder = codecs.getincrementalencoder('shift-jis')('backslashreplace')
    114         self.assertEqual(encoder.encode(u'\xff'), b'\\xff')
    115         self.assertEqual(encoder.encode(u'\n'), b'\n')
    116 
    117 class Test_IncrementalDecoder(unittest.TestCase):
    118 
    119     def test_dbcs(self):
    120         # cp949 decoder is simple with only 1 or 2 bytes sequences.
    121         decoder = codecs.getincrementaldecoder('cp949')()
    122         self.assertEqual(decoder.decode('\xc6\xc4\xc0\xcc\xbd'),
    123                          u'\ud30c\uc774')
    124         self.assertEqual(decoder.decode('\xe3 \xb8\xb6\xc0\xbb'),
    125                          u'\uc36c \ub9c8\uc744')
    126         self.assertEqual(decoder.decode(''), u'')
    127 
    128     def test_dbcs_keep_buffer(self):
    129         decoder = codecs.getincrementaldecoder('cp949')()
    130         self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
    131         self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
    132         self.assertEqual(decoder.decode('\xcc'), u'\uc774')
    133 
    134         self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
    135         self.assertRaises(UnicodeDecodeError, decoder.decode, '\xcc\xbd', True)
    136         self.assertEqual(decoder.decode('\xcc'), u'\uc774')
    137 
    138     def test_iso2022(self):
    139         decoder = codecs.getincrementaldecoder('iso2022-jp')()
    140         ESC = '\x1b'
    141         self.assertEqual(decoder.decode(ESC + '('), u'')
    142         self.assertEqual(decoder.decode('B', True), u'')
    143         self.assertEqual(decoder.decode(ESC + '$'), u'')
    144         self.assertEqual(decoder.decode('B@$'), u'\u4e16')
    145         self.assertEqual(decoder.decode('@$@'), u'\u4e16')
    146         self.assertEqual(decoder.decode('$', True), u'\u4e16')
    147         self.assertEqual(decoder.reset(), None)
    148         self.assertEqual(decoder.decode('@$'), u'@$')
    149         self.assertEqual(decoder.decode(ESC + '$'), u'')
    150         self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
    151         self.assertEqual(decoder.decode('B@$'), u'\u4e16')
    152 
    153 class Test_StreamReader(unittest.TestCase):
    154     def test_bug1728403(self):
    155         try:
    156             open(TESTFN, 'w').write('\xa1')
    157             f = codecs.open(TESTFN, encoding='cp949')
    158             self.assertRaises(UnicodeDecodeError, f.read, 2)
    159         finally:
    160             try: f.close()
    161             except: pass
    162             os.unlink(TESTFN)
    163 
    164 class Test_StreamWriter(unittest.TestCase):
    165     @unittest.skipUnless(len(u'\U00012345') == 2, 'need a narrow build')
    166     def test_gb18030(self):
    167         s = StringIO.StringIO()
    168         c = codecs.getwriter('gb18030')(s)
    169         c.write(u'123')
    170         self.assertEqual(s.getvalue(), '123')
    171         c.write(u'\U00012345')
    172         self.assertEqual(s.getvalue(), '123\x907\x959')
    173         c.write(u'\U00012345'[0])
    174         self.assertEqual(s.getvalue(), '123\x907\x959')
    175         c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
    176         self.assertEqual(s.getvalue(),
    177                 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
    178         c.write(u'\U00012345'[0])
    179         self.assertEqual(s.getvalue(),
    180                 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
    181         self.assertRaises(UnicodeError, c.reset)
    182         self.assertEqual(s.getvalue(),
    183                 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
    184 
    185     @unittest.skipUnless(len(u'\U00012345') == 2, 'need a narrow build')
    186     def test_utf_8(self):
    187         s= StringIO.StringIO()
    188         c = codecs.getwriter('utf-8')(s)
    189         c.write(u'123')
    190         self.assertEqual(s.getvalue(), '123')
    191         c.write(u'\U00012345')
    192         self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
    193 
    194         # Python utf-8 codec can't buffer surrogate pairs yet.
    195         if 0:
    196             c.write(u'\U00012345'[0])
    197             self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
    198             c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
    199             self.assertEqual(s.getvalue(),
    200                 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
    201                 '\xea\xb0\x80\xc2\xac')
    202             c.write(u'\U00012345'[0])
    203             self.assertEqual(s.getvalue(),
    204                 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
    205                 '\xea\xb0\x80\xc2\xac')
    206             c.reset()
    207             self.assertEqual(s.getvalue(),
    208                 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
    209                 '\xea\xb0\x80\xc2\xac\xed\xa0\x88')
    210             c.write(u'\U00012345'[1])
    211             self.assertEqual(s.getvalue(),
    212                 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
    213                 '\xea\xb0\x80\xc2\xac\xed\xa0\x88\xed\xbd\x85')
    214 
    215     def test_streamwriter_strwrite(self):
    216         s = StringIO.StringIO()
    217         wr = codecs.getwriter('gb18030')(s)
    218         wr.write('abcd')
    219         self.assertEqual(s.getvalue(), 'abcd')
    220 
    221 class Test_ISO2022(unittest.TestCase):
    222     def test_g2(self):
    223         iso2022jp2 = '\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
    224         uni = u':hu4:unit\xe9 de famille'
    225         self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)
    226 
    227     def test_iso2022_jp_g0(self):
    228         self.assertNotIn('\x0e', u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
    229         for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
    230             e = u'\u3406'.encode(encoding)
    231             self.assertFalse(filter(lambda x: x >= '\x80', e))
    232 
    233     def test_bug1572832(self):
    234         if sys.maxunicode >= 0x10000:
    235             myunichr = unichr
    236         else:
    237             myunichr = lambda x: unichr(0xD7C0+(x>>10)) + unichr(0xDC00+(x&0x3FF))
    238 
    239         for x in xrange(0x10000, 0x110000):
    240             # Any ISO 2022 codec will cause the segfault
    241             myunichr(x).encode('iso_2022_jp', 'ignore')
    242 
    243 class TestStateful(unittest.TestCase):
    244     text = u'\u4E16\u4E16'
    245     encoding = 'iso-2022-jp'
    246     expected = b'\x1b$B@$@$'
    247     expected_reset = b'\x1b$B@$@$\x1b(B'
    248 
    249     def test_encode(self):
    250         self.assertEqual(self.text.encode(self.encoding), self.expected_reset)
    251 
    252     def test_incrementalencoder(self):
    253         encoder = codecs.getincrementalencoder(self.encoding)()
    254         output = b''.join(
    255             encoder.encode(char)
    256             for char in self.text)
    257         self.assertEqual(output, self.expected)
    258 
    259     def test_incrementalencoder_final(self):
    260         encoder = codecs.getincrementalencoder(self.encoding)()
    261         last_index = len(self.text) - 1
    262         output = b''.join(
    263             encoder.encode(char, index == last_index)
    264             for index, char in enumerate(self.text))
    265         self.assertEqual(output, self.expected_reset)
    266 
    267 class TestHZStateful(TestStateful):
    268     text = u'\u804a\u804a'
    269     encoding = 'hz'
    270     expected = b'~{ADAD'
    271     expected_reset = b'~{ADAD~}'
    272 
    273 def test_main():
    274     test_support.run_unittest(__name__)
    275 
    276 if __name__ == "__main__":
    277     test_main()
    278