Home | History | Annotate | Download | only in test
      1 #
      2 # test_multibytecodec.py
      3 #   Unit test for multibytecodec itself
      4 #
      5 
      6 from test import support
      7 from test.support import TESTFN
      8 import unittest, io, codecs, sys
      9 import _multibytecodec
     10 
     11 ALL_CJKENCODINGS = [
     12 # _codecs_cn
     13     'gb2312', 'gbk', 'gb18030', 'hz',
     14 # _codecs_hk
     15     'big5hkscs',
     16 # _codecs_jp
     17     'cp932', 'shift_jis', 'euc_jp', 'euc_jisx0213', 'shift_jisx0213',
     18     'euc_jis_2004', 'shift_jis_2004',
     19 # _codecs_kr
     20     'cp949', 'euc_kr', 'johab',
     21 # _codecs_tw
     22     'big5', 'cp950',
     23 # _codecs_iso2022
     24     'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004',
     25     'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr',
     26 ]
     27 
     28 class Test_MultibyteCodec(unittest.TestCase):
     29 
     30     def test_nullcoding(self):
     31         for enc in ALL_CJKENCODINGS:
     32             self.assertEqual(b''.decode(enc), '')
     33             self.assertEqual(str(b'', enc), '')
     34             self.assertEqual(''.encode(enc), b'')
     35 
     36     def test_str_decode(self):
     37         for enc in ALL_CJKENCODINGS:
     38             self.assertEqual('abcd'.encode(enc), b'abcd')
     39 
     40     def test_errorcallback_longindex(self):
     41         dec = codecs.getdecoder('euc-kr')
     42         myreplace  = lambda exc: ('', sys.maxsize+1)
     43         codecs.register_error('test.cjktest', myreplace)
     44         self.assertRaises(IndexError, dec,
     45                           b'apple\x92ham\x93spam', 'test.cjktest')
     46 
     47     def test_errorcallback_custom_ignore(self):
     48         # Issue #23215: MemoryError with custom error handlers and multibyte codecs
     49         data = 100 * "\udc00"
     50         codecs.register_error("test.ignore", codecs.ignore_errors)
     51         for enc in ALL_CJKENCODINGS:
     52             self.assertEqual(data.encode(enc, "test.ignore"), b'')
     53 
     54     def test_codingspec(self):
     55         try:
     56             for enc in ALL_CJKENCODINGS:
     57                 code = '# coding: {}\n'.format(enc)
     58                 exec(code)
     59         finally:
     60             support.unlink(TESTFN)
     61 
     62     def test_init_segfault(self):
     63         # bug #3305: this used to segfault
     64         self.assertRaises(AttributeError,
     65                           _multibytecodec.MultibyteStreamReader, None)
     66         self.assertRaises(AttributeError,
     67                           _multibytecodec.MultibyteStreamWriter, None)
     68 
     69     def test_decode_unicode(self):
     70         # Trying to decode a unicode string should raise a TypeError
     71         for enc in ALL_CJKENCODINGS:
     72             self.assertRaises(TypeError, codecs.getdecoder(enc), "")
     73 
     74 class Test_IncrementalEncoder(unittest.TestCase):
     75 
     76     def test_stateless(self):
     77         # cp949 encoder isn't stateful at all.
     78         encoder = codecs.getincrementalencoder('cp949')()
     79         self.assertEqual(encoder.encode('\ud30c\uc774\uc36c \ub9c8\uc744'),
     80                          b'\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
     81         self.assertEqual(encoder.reset(), None)
     82         self.assertEqual(encoder.encode('\u2606\u223c\u2606', True),
     83                          b'\xa1\xd9\xa1\xad\xa1\xd9')
     84         self.assertEqual(encoder.reset(), None)
     85         self.assertEqual(encoder.encode('', True), b'')
     86         self.assertEqual(encoder.encode('', False), b'')
     87         self.assertEqual(encoder.reset(), None)
     88 
     89     def test_stateful(self):
     90         # jisx0213 encoder is stateful for a few code points. eg)
     91         #   U+00E6 => A9DC
     92         #   U+00E6 U+0300 => ABC4
     93         #   U+0300 => ABDC
     94 
     95         encoder = codecs.getincrementalencoder('jisx0213')()
     96         self.assertEqual(encoder.encode('\u00e6\u0300'), b'\xab\xc4')
     97         self.assertEqual(encoder.encode('\u00e6'), b'')
     98         self.assertEqual(encoder.encode('\u0300'), b'\xab\xc4')
     99         self.assertEqual(encoder.encode('\u00e6', True), b'\xa9\xdc')
    100 
    101         self.assertEqual(encoder.reset(), None)
    102         self.assertEqual(encoder.encode('\u0300'), b'\xab\xdc')
    103 
    104         self.assertEqual(encoder.encode('\u00e6'), b'')
    105         self.assertEqual(encoder.encode('', True), b'\xa9\xdc')
    106         self.assertEqual(encoder.encode('', True), b'')
    107 
    108     def test_stateful_keep_buffer(self):
    109         encoder = codecs.getincrementalencoder('jisx0213')()
    110         self.assertEqual(encoder.encode('\u00e6'), b'')
    111         self.assertRaises(UnicodeEncodeError, encoder.encode, '\u0123')
    112         self.assertEqual(encoder.encode('\u0300\u00e6'), b'\xab\xc4')
    113         self.assertRaises(UnicodeEncodeError, encoder.encode, '\u0123')
    114         self.assertEqual(encoder.reset(), None)
    115         self.assertEqual(encoder.encode('\u0300'), b'\xab\xdc')
    116         self.assertEqual(encoder.encode('\u00e6'), b'')
    117         self.assertRaises(UnicodeEncodeError, encoder.encode, '\u0123')
    118         self.assertEqual(encoder.encode('', True), b'\xa9\xdc')
    119 
    120     def test_issue5640(self):
    121         encoder = codecs.getincrementalencoder('shift-jis')('backslashreplace')
    122         self.assertEqual(encoder.encode('\xff'), b'\\xff')
    123         self.assertEqual(encoder.encode('\n'), b'\n')
    124 
    125 class Test_IncrementalDecoder(unittest.TestCase):
    126 
    127     def test_dbcs(self):
    128         # cp949 decoder is simple with only 1 or 2 bytes sequences.
    129         decoder = codecs.getincrementaldecoder('cp949')()
    130         self.assertEqual(decoder.decode(b'\xc6\xc4\xc0\xcc\xbd'),
    131                          '\ud30c\uc774')
    132         self.assertEqual(decoder.decode(b'\xe3 \xb8\xb6\xc0\xbb'),
    133                          '\uc36c \ub9c8\uc744')
    134         self.assertEqual(decoder.decode(b''), '')
    135 
    136     def test_dbcs_keep_buffer(self):
    137         decoder = codecs.getincrementaldecoder('cp949')()
    138         self.assertEqual(decoder.decode(b'\xc6\xc4\xc0'), '\ud30c')
    139         self.assertRaises(UnicodeDecodeError, decoder.decode, b'', True)
    140         self.assertEqual(decoder.decode(b'\xcc'), '\uc774')
    141 
    142         self.assertEqual(decoder.decode(b'\xc6\xc4\xc0'), '\ud30c')
    143         self.assertRaises(UnicodeDecodeError, decoder.decode,
    144                           b'\xcc\xbd', True)
    145         self.assertEqual(decoder.decode(b'\xcc'), '\uc774')
    146 
    147     def test_iso2022(self):
    148         decoder = codecs.getincrementaldecoder('iso2022-jp')()
    149         ESC = b'\x1b'
    150         self.assertEqual(decoder.decode(ESC + b'('), '')
    151         self.assertEqual(decoder.decode(b'B', True), '')
    152         self.assertEqual(decoder.decode(ESC + b'$'), '')
    153         self.assertEqual(decoder.decode(b'B@$'), '\u4e16')
    154         self.assertEqual(decoder.decode(b'@$@'), '\u4e16')
    155         self.assertEqual(decoder.decode(b'$', True), '\u4e16')
    156         self.assertEqual(decoder.reset(), None)
    157         self.assertEqual(decoder.decode(b'@$'), '@$')
    158         self.assertEqual(decoder.decode(ESC + b'$'), '')
    159         self.assertRaises(UnicodeDecodeError, decoder.decode, b'', True)
    160         self.assertEqual(decoder.decode(b'B@$'), '\u4e16')
    161 
    162     def test_decode_unicode(self):
    163         # Trying to decode a unicode string should raise a TypeError
    164         for enc in ALL_CJKENCODINGS:
    165             decoder = codecs.getincrementaldecoder(enc)()
    166             self.assertRaises(TypeError, decoder.decode, "")
    167 
    168 class Test_StreamReader(unittest.TestCase):
    169     def test_bug1728403(self):
    170         try:
    171             f = open(TESTFN, 'wb')
    172             try:
    173                 f.write(b'\xa1')
    174             finally:
    175                 f.close()
    176             f = codecs.open(TESTFN, encoding='cp949')
    177             try:
    178                 self.assertRaises(UnicodeDecodeError, f.read, 2)
    179             finally:
    180                 f.close()
    181         finally:
    182             support.unlink(TESTFN)
    183 
    184 class Test_StreamWriter(unittest.TestCase):
    185     def test_gb18030(self):
    186         s= io.BytesIO()
    187         c = codecs.getwriter('gb18030')(s)
    188         c.write('123')
    189         self.assertEqual(s.getvalue(), b'123')
    190         c.write('\U00012345')
    191         self.assertEqual(s.getvalue(), b'123\x907\x959')
    192         c.write('\uac00\u00ac')
    193         self.assertEqual(s.getvalue(),
    194                 b'123\x907\x959\x827\xcf5\x810\x851')
    195 
    196     def test_utf_8(self):
    197         s= io.BytesIO()
    198         c = codecs.getwriter('utf-8')(s)
    199         c.write('123')
    200         self.assertEqual(s.getvalue(), b'123')
    201         c.write('\U00012345')
    202         self.assertEqual(s.getvalue(), b'123\xf0\x92\x8d\x85')
    203         c.write('\uac00\u00ac')
    204         self.assertEqual(s.getvalue(),
    205             b'123\xf0\x92\x8d\x85'
    206             b'\xea\xb0\x80\xc2\xac')
    207 
    208     def test_streamwriter_strwrite(self):
    209         s = io.BytesIO()
    210         wr = codecs.getwriter('gb18030')(s)
    211         wr.write('abcd')
    212         self.assertEqual(s.getvalue(), b'abcd')
    213 
    214 class Test_ISO2022(unittest.TestCase):
    215     def test_g2(self):
    216         iso2022jp2 = b'\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
    217         uni = ':hu4:unit\xe9 de famille'
    218         self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)
    219 
    220     def test_iso2022_jp_g0(self):
    221         self.assertNotIn(b'\x0e', '\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
    222         for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
    223             e = '\u3406'.encode(encoding)
    224             self.assertFalse(any(x > 0x80 for x in e))
    225 
    226     def test_bug1572832(self):
    227         for x in range(0x10000, 0x110000):
    228             # Any ISO 2022 codec will cause the segfault
    229             chr(x).encode('iso_2022_jp', 'ignore')
    230 
    231 class TestStateful(unittest.TestCase):
    232     text = '\u4E16\u4E16'
    233     encoding = 'iso-2022-jp'
    234     expected = b'\x1b$B@$@$'
    235     reset = b'\x1b(B'
    236     expected_reset = expected + reset
    237 
    238     def test_encode(self):
    239         self.assertEqual(self.text.encode(self.encoding), self.expected_reset)
    240 
    241     def test_incrementalencoder(self):
    242         encoder = codecs.getincrementalencoder(self.encoding)()
    243         output = b''.join(
    244             encoder.encode(char)
    245             for char in self.text)
    246         self.assertEqual(output, self.expected)
    247         self.assertEqual(encoder.encode('', final=True), self.reset)
    248         self.assertEqual(encoder.encode('', final=True), b'')
    249 
    250     def test_incrementalencoder_final(self):
    251         encoder = codecs.getincrementalencoder(self.encoding)()
    252         last_index = len(self.text) - 1
    253         output = b''.join(
    254             encoder.encode(char, index == last_index)
    255             for index, char in enumerate(self.text))
    256         self.assertEqual(output, self.expected_reset)
    257         self.assertEqual(encoder.encode('', final=True), b'')
    258 
    259 class TestHZStateful(TestStateful):
    260     text = '\u804a\u804a'
    261     encoding = 'hz'
    262     expected = b'~{ADAD'
    263     reset = b'~}'
    264     expected_reset = expected + reset
    265 
    266 def test_main():
    267     support.run_unittest(__name__)
    268 
    269 if __name__ == "__main__":
    270     test_main()
    271