Home | History | Annotate | Download | only in test
      1 #!/usr/bin/env python

      2 #

      3 # test_multibytecodec.py

      4 #   Unit test for multibytecodec itself

      5 #

      6 
      7 from test import test_support
      8 from test.test_support import TESTFN
      9 import unittest, StringIO, codecs, sys, os
     10 import _multibytecodec
     11 
     12 ALL_CJKENCODINGS = [
     13 # _codecs_cn

     14     'gb2312', 'gbk', 'gb18030', 'hz',
     15 # _codecs_hk

     16     'big5hkscs',
     17 # _codecs_jp

     18     'cp932', 'shift_jis', 'euc_jp', 'euc_jisx0213', 'shift_jisx0213',
     19     'euc_jis_2004', 'shift_jis_2004',
     20 # _codecs_kr

     21     'cp949', 'euc_kr', 'johab',
     22 # _codecs_tw

     23     'big5', 'cp950',
     24 # _codecs_iso2022

     25     'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004',
     26     'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr',
     27 ]
     28 
     29 class Test_MultibyteCodec(unittest.TestCase):
     30 
     31     def test_nullcoding(self):
     32         for enc in ALL_CJKENCODINGS:
     33             self.assertEqual(''.decode(enc), u'')
     34             self.assertEqual(unicode('', enc), u'')
     35             self.assertEqual(u''.encode(enc), '')
     36 
     37     def test_str_decode(self):
     38         for enc in ALL_CJKENCODINGS:
     39             self.assertEqual('abcd'.encode(enc), 'abcd')
     40 
     41     def test_errorcallback_longindex(self):
     42         dec = codecs.getdecoder('euc-kr')
     43         myreplace  = lambda exc: (u'', sys.maxint+1)
     44         codecs.register_error('test.cjktest', myreplace)
     45         self.assertRaises(IndexError, dec,
     46                           'apple\x92ham\x93spam', 'test.cjktest')
     47 
     48     def test_codingspec(self):
     49         for enc in ALL_CJKENCODINGS:
     50             code = '# coding: {}\n'.format(enc)
     51             exec code
     52 
     53     def test_init_segfault(self):
     54         # bug #3305: this used to segfault

     55         self.assertRaises(AttributeError,
     56                           _multibytecodec.MultibyteStreamReader, None)
     57         self.assertRaises(AttributeError,
     58                           _multibytecodec.MultibyteStreamWriter, None)
     59 
     60 
     61 class Test_IncrementalEncoder(unittest.TestCase):
     62 
     63     def test_stateless(self):
     64         # cp949 encoder isn't stateful at all.

     65         encoder = codecs.getincrementalencoder('cp949')()
     66         self.assertEqual(encoder.encode(u'\ud30c\uc774\uc36c \ub9c8\uc744'),
     67                          '\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
     68         self.assertEqual(encoder.reset(), None)
     69         self.assertEqual(encoder.encode(u'\u2606\u223c\u2606', True),
     70                          '\xa1\xd9\xa1\xad\xa1\xd9')
     71         self.assertEqual(encoder.reset(), None)
     72         self.assertEqual(encoder.encode(u'', True), '')
     73         self.assertEqual(encoder.encode(u'', False), '')
     74         self.assertEqual(encoder.reset(), None)
     75 
     76     def test_stateful(self):
     77         # jisx0213 encoder is stateful for a few codepoints. eg)

     78         #   U+00E6 => A9DC

     79         #   U+00E6 U+0300 => ABC4

     80         #   U+0300 => ABDC

     81 
     82         encoder = codecs.getincrementalencoder('jisx0213')()
     83         self.assertEqual(encoder.encode(u'\u00e6\u0300'), '\xab\xc4')
     84         self.assertEqual(encoder.encode(u'\u00e6'), '')
     85         self.assertEqual(encoder.encode(u'\u0300'), '\xab\xc4')
     86         self.assertEqual(encoder.encode(u'\u00e6', True), '\xa9\xdc')
     87 
     88         self.assertEqual(encoder.reset(), None)
     89         self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
     90 
     91         self.assertEqual(encoder.encode(u'\u00e6'), '')
     92         self.assertEqual(encoder.encode('', True), '\xa9\xdc')
     93         self.assertEqual(encoder.encode('', True), '')
     94 
     95     def test_stateful_keep_buffer(self):
     96         encoder = codecs.getincrementalencoder('jisx0213')()
     97         self.assertEqual(encoder.encode(u'\u00e6'), '')
     98         self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
     99         self.assertEqual(encoder.encode(u'\u0300\u00e6'), '\xab\xc4')
    100         self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
    101         self.assertEqual(encoder.reset(), None)
    102         self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
    103         self.assertEqual(encoder.encode(u'\u00e6'), '')
    104         self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
    105         self.assertEqual(encoder.encode(u'', True), '\xa9\xdc')
    106 
    107     def test_issue5640(self):
    108         encoder = codecs.getincrementalencoder('shift-jis')('backslashreplace')
    109         self.assertEqual(encoder.encode(u'\xff'), b'\\xff')
    110         self.assertEqual(encoder.encode(u'\n'), b'\n')
    111 
    112 class Test_IncrementalDecoder(unittest.TestCase):
    113 
    114     def test_dbcs(self):
    115         # cp949 decoder is simple with only 1 or 2 bytes sequences.

    116         decoder = codecs.getincrementaldecoder('cp949')()
    117         self.assertEqual(decoder.decode('\xc6\xc4\xc0\xcc\xbd'),
    118                          u'\ud30c\uc774')
    119         self.assertEqual(decoder.decode('\xe3 \xb8\xb6\xc0\xbb'),
    120                          u'\uc36c \ub9c8\uc744')
    121         self.assertEqual(decoder.decode(''), u'')
    122 
    123     def test_dbcs_keep_buffer(self):
    124         decoder = codecs.getincrementaldecoder('cp949')()
    125         self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
    126         self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
    127         self.assertEqual(decoder.decode('\xcc'), u'\uc774')
    128 
    129         self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
    130         self.assertRaises(UnicodeDecodeError, decoder.decode, '\xcc\xbd', True)
    131         self.assertEqual(decoder.decode('\xcc'), u'\uc774')
    132 
    133     def test_iso2022(self):
    134         decoder = codecs.getincrementaldecoder('iso2022-jp')()
    135         ESC = '\x1b'
    136         self.assertEqual(decoder.decode(ESC + '('), u'')
    137         self.assertEqual(decoder.decode('B', True), u'')
    138         self.assertEqual(decoder.decode(ESC + '$'), u'')
    139         self.assertEqual(decoder.decode('B@$'), u'\u4e16')
    140         self.assertEqual(decoder.decode('@$@'), u'\u4e16')
    141         self.assertEqual(decoder.decode('$', True), u'\u4e16')
    142         self.assertEqual(decoder.reset(), None)
    143         self.assertEqual(decoder.decode('@$'), u'@$')
    144         self.assertEqual(decoder.decode(ESC + '$'), u'')
    145         self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
    146         self.assertEqual(decoder.decode('B@$'), u'\u4e16')
    147 
    148 class Test_StreamReader(unittest.TestCase):
    149     def test_bug1728403(self):
    150         try:
    151             open(TESTFN, 'w').write('\xa1')
    152             f = codecs.open(TESTFN, encoding='cp949')
    153             self.assertRaises(UnicodeDecodeError, f.read, 2)
    154         finally:
    155             try: f.close()
    156             except: pass
    157             os.unlink(TESTFN)
    158 
    159 class Test_StreamWriter(unittest.TestCase):
    160     if len(u'\U00012345') == 2: # UCS2

    161         def test_gb18030(self):
    162             s = StringIO.StringIO()
    163             c = codecs.getwriter('gb18030')(s)
    164             c.write(u'123')
    165             self.assertEqual(s.getvalue(), '123')
    166             c.write(u'\U00012345')
    167             self.assertEqual(s.getvalue(), '123\x907\x959')
    168             c.write(u'\U00012345'[0])
    169             self.assertEqual(s.getvalue(), '123\x907\x959')
    170             c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
    171             self.assertEqual(s.getvalue(),
    172                     '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
    173             c.write(u'\U00012345'[0])
    174             self.assertEqual(s.getvalue(),
    175                     '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
    176             self.assertRaises(UnicodeError, c.reset)
    177             self.assertEqual(s.getvalue(),
    178                     '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
    179 
    180         def test_utf_8(self):
    181             s= StringIO.StringIO()
    182             c = codecs.getwriter('utf-8')(s)
    183             c.write(u'123')
    184             self.assertEqual(s.getvalue(), '123')
    185             c.write(u'\U00012345')
    186             self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
    187 
    188             # Python utf-8 codec can't buffer surrogate pairs yet.

    189             if 0:
    190                 c.write(u'\U00012345'[0])
    191                 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
    192                 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
    193                 self.assertEqual(s.getvalue(),
    194                     '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
    195                     '\xea\xb0\x80\xc2\xac')
    196                 c.write(u'\U00012345'[0])
    197                 self.assertEqual(s.getvalue(),
    198                     '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
    199                     '\xea\xb0\x80\xc2\xac')
    200                 c.reset()
    201                 self.assertEqual(s.getvalue(),
    202                     '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
    203                     '\xea\xb0\x80\xc2\xac\xed\xa0\x88')
    204                 c.write(u'\U00012345'[1])
    205                 self.assertEqual(s.getvalue(),
    206                     '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
    207                     '\xea\xb0\x80\xc2\xac\xed\xa0\x88\xed\xbd\x85')
    208 
    209     else: # UCS4

    210         pass
    211 
    212     def test_streamwriter_strwrite(self):
    213         s = StringIO.StringIO()
    214         wr = codecs.getwriter('gb18030')(s)
    215         wr.write('abcd')
    216         self.assertEqual(s.getvalue(), 'abcd')
    217 
    218 class Test_ISO2022(unittest.TestCase):
    219     def test_g2(self):
    220         iso2022jp2 = '\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
    221         uni = u':hu4:unit\xe9 de famille'
    222         self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)
    223 
    224     def test_iso2022_jp_g0(self):
    225         self.assertNotIn('\x0e', u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
    226         for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
    227             e = u'\u3406'.encode(encoding)
    228             self.assertFalse(filter(lambda x: x >= '\x80', e))
    229 
    230     def test_bug1572832(self):
    231         if sys.maxunicode >= 0x10000:
    232             myunichr = unichr
    233         else:
    234             myunichr = lambda x: unichr(0xD7C0+(x>>10)) + unichr(0xDC00+(x&0x3FF))
    235 
    236         for x in xrange(0x10000, 0x110000):
    237             # Any ISO 2022 codec will cause the segfault

    238             myunichr(x).encode('iso_2022_jp', 'ignore')
    239 
    240 class TestStateful(unittest.TestCase):
    241     text = u'\u4E16\u4E16'
    242     encoding = 'iso-2022-jp'
    243     expected = b'\x1b$B@$@$'
    244     expected_reset = b'\x1b$B@$@$\x1b(B'
    245 
    246     def test_encode(self):
    247         self.assertEqual(self.text.encode(self.encoding), self.expected_reset)
    248 
    249     def test_incrementalencoder(self):
    250         encoder = codecs.getincrementalencoder(self.encoding)()
    251         output = b''.join(
    252             encoder.encode(char)
    253             for char in self.text)
    254         self.assertEqual(output, self.expected)
    255 
    256     def test_incrementalencoder_final(self):
    257         encoder = codecs.getincrementalencoder(self.encoding)()
    258         last_index = len(self.text) - 1
    259         output = b''.join(
    260             encoder.encode(char, index == last_index)
    261             for index, char in enumerate(self.text))
    262         self.assertEqual(output, self.expected_reset)
    263 
    264 class TestHZStateful(TestStateful):
    265     text = u'\u804a\u804a'
    266     encoding = 'hz'
    267     expected = b'~{ADAD'
    268     expected_reset = b'~{ADAD~}'
    269 
    270 def test_main():
    271     test_support.run_unittest(__name__)
    272 
    273 if __name__ == "__main__":
    274     test_main()
    275