Home | History | Annotate | Download | only in test
      1 from test import test_support
      2 import unittest
      3 import codecs
      4 import locale
      5 import sys, StringIO, _testcapi
      6 
      7 def coding_checker(self, coder):
      8     def check(input, expect):
      9         self.assertEqual(coder(input), (expect, len(input)))
     10     return check
     11 
     12 class Queue(object):
     13     """
     14     queue: write bytes at one end, read bytes from the other end
     15     """
     16     def __init__(self):
     17         self._buffer = ""
     18 
     19     def write(self, chars):
     20         self._buffer += chars
     21 
     22     def read(self, size=-1):
     23         if size<0:
     24             s = self._buffer
     25             self._buffer = ""
     26             return s
     27         else:
     28             s = self._buffer[:size]
     29             self._buffer = self._buffer[size:]
     30             return s
     31 
     32 class ReadTest(unittest.TestCase):
     33     def check_partial(self, input, partialresults):
     34         # get a StreamReader for the encoding and feed the bytestring version
     35         # of input to the reader byte by byte. Read everything available from
     36         # the StreamReader and check that the results equal the appropriate
     37         # entries from partialresults.
     38         q = Queue()
     39         r = codecs.getreader(self.encoding)(q)
     40         result = u""
     41         for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
     42             q.write(c)
     43             result += r.read()
     44             self.assertEqual(result, partialresult)
     45         # check that there's nothing left in the buffers
     46         self.assertEqual(r.read(), u"")
     47         self.assertEqual(r.bytebuffer, "")
     48         self.assertEqual(r.charbuffer, u"")
     49 
     50         # do the check again, this time using a incremental decoder
     51         d = codecs.getincrementaldecoder(self.encoding)()
     52         result = u""
     53         for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
     54             result += d.decode(c)
     55             self.assertEqual(result, partialresult)
     56         # check that there's nothing left in the buffers
     57         self.assertEqual(d.decode("", True), u"")
     58         self.assertEqual(d.buffer, "")
     59 
     60         # Check whether the reset method works properly
     61         d.reset()
     62         result = u""
     63         for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
     64             result += d.decode(c)
     65             self.assertEqual(result, partialresult)
     66         # check that there's nothing left in the buffers
     67         self.assertEqual(d.decode("", True), u"")
     68         self.assertEqual(d.buffer, "")
     69 
     70         # check iterdecode()
     71         encoded = input.encode(self.encoding)
     72         self.assertEqual(
     73             input,
     74             u"".join(codecs.iterdecode(encoded, self.encoding))
     75         )
     76 
     77     def test_readline(self):
     78         def getreader(input):
     79             stream = StringIO.StringIO(input.encode(self.encoding))
     80             return codecs.getreader(self.encoding)(stream)
     81 
     82         def readalllines(input, keepends=True, size=None):
     83             reader = getreader(input)
     84             lines = []
     85             while True:
     86                 line = reader.readline(size=size, keepends=keepends)
     87                 if not line:
     88                     break
     89                 lines.append(line)
     90             return "|".join(lines)
     91 
     92         s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
     93         sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
     94         sexpectednoends = u"foo|bar|baz|spam|eggs"
     95         self.assertEqual(readalllines(s, True), sexpected)
     96         self.assertEqual(readalllines(s, False), sexpectednoends)
     97         self.assertEqual(readalllines(s, True, 10), sexpected)
     98         self.assertEqual(readalllines(s, False, 10), sexpectednoends)
     99 
    100         # Test long lines (multiple calls to read() in readline())
    101         vw = []
    102         vwo = []
    103         for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
    104             vw.append((i*200)*u"\3042" + lineend)
    105             vwo.append((i*200)*u"\3042")
    106         self.assertEqual(readalllines("".join(vw), True), "".join(vw))
    107         self.assertEqual(readalllines("".join(vw), False),"".join(vwo))
    108 
    109         # Test lines where the first read might end with \r, so the
    110         # reader has to look ahead whether this is a lone \r or a \r\n
    111         for size in xrange(80):
    112             for lineend in u"\n \r\n \r \u2028".split():
    113                 s = 10*(size*u"a" + lineend + u"xxx\n")
    114                 reader = getreader(s)
    115                 for i in xrange(10):
    116                     self.assertEqual(
    117                         reader.readline(keepends=True),
    118                         size*u"a" + lineend,
    119                     )
    120                 reader = getreader(s)
    121                 for i in xrange(10):
    122                     self.assertEqual(
    123                         reader.readline(keepends=False),
    124                         size*u"a",
    125                     )
    126 
    127     def test_bug1175396(self):
    128         s = [
    129             '<%!--===================================================\r\n',
    130             '    BLOG index page: show recent articles,\r\n',
    131             '    today\'s articles, or articles of a specific date.\r\n',
    132             '========================================================--%>\r\n',
    133             '<%@inputencoding="ISO-8859-1"%>\r\n',
    134             '<%@pagetemplate=TEMPLATE.y%>\r\n',
    135             '<%@import=import frog.util, frog%>\r\n',
    136             '<%@import=import frog.objects%>\r\n',
    137             '<%@import=from frog.storageerrors import StorageError%>\r\n',
    138             '<%\r\n',
    139             '\r\n',
    140             'import logging\r\n',
    141             'log=logging.getLogger("Snakelets.logger")\r\n',
    142             '\r\n',
    143             '\r\n',
    144             'user=self.SessionCtx.user\r\n',
    145             'storageEngine=self.SessionCtx.storageEngine\r\n',
    146             '\r\n',
    147             '\r\n',
    148             'def readArticlesFromDate(date, count=None):\r\n',
    149             '    entryids=storageEngine.listBlogEntries(date)\r\n',
    150             '    entryids.reverse() # descending\r\n',
    151             '    if count:\r\n',
    152             '        entryids=entryids[:count]\r\n',
    153             '    try:\r\n',
    154             '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
    155             '    except StorageError,x:\r\n',
    156             '        log.error("Error loading articles: "+str(x))\r\n',
    157             '        self.abort("cannot load articles")\r\n',
    158             '\r\n',
    159             'showdate=None\r\n',
    160             '\r\n',
    161             'arg=self.Request.getArg()\r\n',
    162             'if arg=="today":\r\n',
    163             '    #-------------------- TODAY\'S ARTICLES\r\n',
    164             '    self.write("<h2>Today\'s articles</h2>")\r\n',
    165             '    showdate = frog.util.isodatestr() \r\n',
    166             '    entries = readArticlesFromDate(showdate)\r\n',
    167             'elif arg=="active":\r\n',
    168             '    #-------------------- ACTIVE ARTICLES redirect\r\n',
    169             '    self.Yredirect("active.y")\r\n',
    170             'elif arg=="login":\r\n',
    171             '    #-------------------- LOGIN PAGE redirect\r\n',
    172             '    self.Yredirect("login.y")\r\n',
    173             'elif arg=="date":\r\n',
    174             '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
    175             '    showdate = self.Request.getParameter("date")\r\n',
    176             '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
    177             '    entries = readArticlesFromDate(showdate)\r\n',
    178             'else:\r\n',
    179             '    #-------------------- RECENT ARTICLES\r\n',
    180             '    self.write("<h2>Recent articles</h2>")\r\n',
    181             '    dates=storageEngine.listBlogEntryDates()\r\n',
    182             '    if dates:\r\n',
    183             '        entries=[]\r\n',
    184             '        SHOWAMOUNT=10\r\n',
    185             '        for showdate in dates:\r\n',
    186             '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
    187             '            if len(entries)>=SHOWAMOUNT:\r\n',
    188             '                break\r\n',
    189             '                \r\n',
    190         ]
    191         stream = StringIO.StringIO("".join(s).encode(self.encoding))
    192         reader = codecs.getreader(self.encoding)(stream)
    193         for (i, line) in enumerate(reader):
    194             self.assertEqual(line, s[i])
    195 
    196     def test_readlinequeue(self):
    197         q = Queue()
    198         writer = codecs.getwriter(self.encoding)(q)
    199         reader = codecs.getreader(self.encoding)(q)
    200 
    201         # No lineends
    202         writer.write(u"foo\r")
    203         self.assertEqual(reader.readline(keepends=False), u"foo")
    204         writer.write(u"\nbar\r")
    205         self.assertEqual(reader.readline(keepends=False), u"")
    206         self.assertEqual(reader.readline(keepends=False), u"bar")
    207         writer.write(u"baz")
    208         self.assertEqual(reader.readline(keepends=False), u"baz")
    209         self.assertEqual(reader.readline(keepends=False), u"")
    210 
    211         # Lineends
    212         writer.write(u"foo\r")
    213         self.assertEqual(reader.readline(keepends=True), u"foo\r")
    214         writer.write(u"\nbar\r")
    215         self.assertEqual(reader.readline(keepends=True), u"\n")
    216         self.assertEqual(reader.readline(keepends=True), u"bar\r")
    217         writer.write(u"baz")
    218         self.assertEqual(reader.readline(keepends=True), u"baz")
    219         self.assertEqual(reader.readline(keepends=True), u"")
    220         writer.write(u"foo\r\n")
    221         self.assertEqual(reader.readline(keepends=True), u"foo\r\n")
    222 
    223     def test_bug1098990_a(self):
    224         s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
    225         s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
    226         s3 = u"next line.\r\n"
    227 
    228         s = (s1+s2+s3).encode(self.encoding)
    229         stream = StringIO.StringIO(s)
    230         reader = codecs.getreader(self.encoding)(stream)
    231         self.assertEqual(reader.readline(), s1)
    232         self.assertEqual(reader.readline(), s2)
    233         self.assertEqual(reader.readline(), s3)
    234         self.assertEqual(reader.readline(), u"")
    235 
    236     def test_bug1098990_b(self):
    237         s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
    238         s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
    239         s3 = u"stillokay:bbbbxx\r\n"
    240         s4 = u"broken!!!!badbad\r\n"
    241         s5 = u"againokay.\r\n"
    242 
    243         s = (s1+s2+s3+s4+s5).encode(self.encoding)
    244         stream = StringIO.StringIO(s)
    245         reader = codecs.getreader(self.encoding)(stream)
    246         self.assertEqual(reader.readline(), s1)
    247         self.assertEqual(reader.readline(), s2)
    248         self.assertEqual(reader.readline(), s3)
    249         self.assertEqual(reader.readline(), s4)
    250         self.assertEqual(reader.readline(), s5)
    251         self.assertEqual(reader.readline(), u"")
    252 
class UTF32Test(ReadTest):
    # BOM-prefixed UTF-32 codec: byte order is detected from the BOM.
    encoding = "utf-32"

    # u"spamspam" encoded with a little-endian / big-endian BOM prefix.
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        # A StreamWriter must emit the BOM once, not once per write().
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # 0xFFFFFFFF is neither a valid BOM nor a valid code point in
        # either byte order, so reading must fail.
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected snapshot per encoded byte fed to the decoder:
        # first the 4 BOM bytes, then 4 bytes per character.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # On a truncated sequence with final=True, 'replace' yields
        # U+FFFD and 'ignore' yields nothing; one byte consumed either way.
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        # A lone byte cannot form a 4-byte unit in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
    337 
class UTF32LETest(ReadTest):
    # Explicit little-endian UTF-32: no BOM is written or expected.
    encoding = "utf-32-le"

    def test_partial(self):
        # One expected snapshot per encoded byte: every character is
        # exactly four bytes, so the output grows on each 4th byte.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        # U+10203 is stored least-significant byte first.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        # A lone byte cannot form a 4-byte unit in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
    381 
class UTF32BETest(ReadTest):
    # Explicit big-endian UTF-32: no BOM is written or expected.
    encoding = "utf-32-be"

    def test_partial(self):
        # One expected snapshot per encoded byte: every character is
        # exactly four bytes, so the output grows on each 4th byte.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        # U+10203 is stored most-significant byte first.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        # A lone byte cannot form a 4-byte unit in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
    425 
    426 
class UTF16Test(ReadTest):
    # BOM-prefixed UTF-16 codec: byte order is detected from the BOM.
    encoding = "utf-16"

    # u"spamspam" encoded with a little-endian / big-endian BOM prefix.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # A StreamWriter must emit the BOM once, not once per write().
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # "\xff\xff" is not a valid BOM in either byte order.
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected snapshot per encoded byte fed to the decoder:
        # 2 BOM bytes, then 2 bytes per BMP character and 4 bytes
        # (a surrogate pair) for the final non-BMP character.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # On a truncated sequence with final=True, 'replace' yields
        # U+FFFD and 'ignore' yields nothing; one byte consumed either way.
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        # A lone byte cannot form a 2-byte unit in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
    499 
class UTF16LETest(ReadTest):
    # Explicit little-endian UTF-16: no BOM handling.
    encoding = "utf-16-le"

    def test_partial(self):
        # One snapshot per byte fed: BMP characters are 2 bytes each,
        # the final non-BMP character is a 4-byte surrogate pair.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (raw bytes, expected output under 'replace') pairs; each raw
        # input is malformed (odd length or unpaired surrogate) and
        # must raise under 'strict'.
        tests = [
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
    536 
class UTF16BETest(ReadTest):
    # Explicit big-endian UTF-16: no BOM handling.
    encoding = "utf-16-be"

    def test_partial(self):
        # One snapshot per byte fed: BMP characters are 2 bytes each,
        # the final non-BMP character is a 4-byte surrogate pair.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (raw bytes, expected output under 'replace') pairs; each raw
        # input is malformed (odd length or unpaired surrogate) and
        # must raise under 'strict'.
        tests = [
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
    573 
class UTF8Test(ReadTest):
    # Plain UTF-8 (no BOM handling).
    encoding = "utf-8"

    def test_partial(self):
        # One snapshot per byte fed: the sample covers 1-, 2-, 3- and
        # 4-byte UTF-8 sequences, so the output only grows when the
        # final byte of each sequence arrives.
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )
    598 
    599 class UTF7Test(ReadTest):
    600     encoding = "utf-7"
    601 
    602     def test_partial(self):
    603         self.check_partial(
    604             u"a+-b",
    605             [
    606                 u"a",
    607                 u"a",
    608                 u"a+",
    609                 u"a+-",
    610                 u"a+-b",
    611             ]
    612         )
    613 
    614 class UTF16ExTest(unittest.TestCase):
    615 
    616     def test_errors(self):
    617         self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
    618 
    619     def test_bad_args(self):
    620         self.assertRaises(TypeError, codecs.utf_16_ex_decode)
    621 
    622 class ReadBufferTest(unittest.TestCase):
    623 
    624     def test_array(self):
    625         import array
    626         self.assertEqual(
    627             codecs.readbuffer_encode(array.array("c", "spam")),
    628             ("spam", 4)
    629         )
    630 
    631     def test_empty(self):
    632         self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
    633 
    634     def test_bad_args(self):
    635         self.assertRaises(TypeError, codecs.readbuffer_encode)
    636         self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
    637 
    638 class CharBufferTest(unittest.TestCase):
    639 
    640     def test_string(self):
    641         self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
    642 
    643     def test_empty(self):
    644         self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
    645 
    646     def test_bad_args(self):
    647         self.assertRaises(TypeError, codecs.charbuffer_encode)
    648         self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
    649 
    650 class UTF8SigTest(ReadTest):
    651     encoding = "utf-8-sig"
    652 
    653     def test_partial(self):
    654         self.check_partial(
    655             u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
    656             [
    657                 u"",
    658                 u"",
    659                 u"", # First BOM has been read and skipped
    660                 u"",
    661                 u"",
    662                 u"\ufeff", # Second BOM has been read and emitted
    663                 u"\ufeff\x00", # "\x00" read and emitted
    664                 u"\ufeff\x00", # First byte of encoded u"\xff" read
    665                 u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
    666                 u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
    667                 u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
    668                 u"\ufeff\x00\xff\u07ff",
    669                 u"\ufeff\x00\xff\u07ff",
    670                 u"\ufeff\x00\xff\u07ff\u0800",
    671                 u"\ufeff\x00\xff\u07ff\u0800",
    672                 u"\ufeff\x00\xff\u07ff\u0800",
    673                 u"\ufeff\x00\xff\u07ff\u0800\uffff",
    674                 u"\ufeff\x00\xff\u07ff\u0800\uffff",
    675                 u"\ufeff\x00\xff\u07ff\u0800\uffff",
    676                 u"\ufeff\x00\xff\u07ff\u0800\uffff",
    677                 u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
    678             ]
    679         )
    680 
    681     def test_bug1601501(self):
    682         # SF bug #1601501: check that the codec works with a buffer
    683         unicode("\xef\xbb\xbf", "utf-8-sig")
    684 
    685     def test_bom(self):
    686         d = codecs.getincrementaldecoder("utf-8-sig")()
    687         s = u"spam"
    688         self.assertEqual(d.decode(s.encode("utf-8-sig")), s)
    689 
    690     def test_stream_bom(self):
    691         unistring = u"ABC\u00A1\u2200XYZ"
    692         bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"
    693 
    694         reader = codecs.getreader("utf-8-sig")
    695         for sizehint in [None] + range(1, 11) + \
    696                         [64, 128, 256, 512, 1024]:
    697             istream = reader(StringIO.StringIO(bytestring))
    698             ostream = StringIO.StringIO()
    699             while 1:
    700                 if sizehint is not None:
    701                     data = istream.read(sizehint)
    702                 else:
    703                     data = istream.read()
    704 
    705                 if not data:
    706                     break
    707                 ostream.write(data)
    708 
    709             got = ostream.getvalue()
    710             self.assertEqual(got, unistring)
    711 
    def test_stream_bare(self):
        """utf-8-sig must also decode input that carries no BOM at all,
        for any read chunk size."""
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + range(1, 11) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(StringIO.StringIO(bytestring))
            pieces = []
            while True:
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)
                if not data:
                    break
                pieces.append(data)
            self.assertEqual(u"".join(pieces), unistring)
    733 
class EscapeDecodeTest(unittest.TestCase):
    # Tests for codecs.escape_decode(): it expands backslash escape
    # sequences in a byte string and returns
    # (decoded_bytes, number_of_input_bytes_consumed).

    def test_empty(self):
        # Decoding the empty string consumes zero bytes.
        self.assertEqual(codecs.escape_decode(""), ("", 0))

    def test_raw(self):
        # Bytes other than the backslash must pass through unchanged.
        decode = codecs.escape_decode
        for b in range(256):
            b = chr(b)
            if b != '\\':
                self.assertEqual(decode(b + '0'), (b + '0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # Line continuation, quotes and the standard C-style escapes.
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", br"[\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        # Octal escapes: one to three digits; a non-octal digit ('8')
        # terminates the escape.
        check(br"[\7]", b"[\x07]")
        check(br"[\8]", br"[\8]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        # Hex escapes: exactly two digits; upper-case \X is not an escape.
        check(br"[\x41]", b"[A]")
        check(br"[\X41]", br"[\X41]")
        check(br"[\x410]", b"[A0]")
        # Any unrecognized escape passes through with its backslash.
        for b in range(256):
            b = chr(b)
            if b not in '\n"\'\\abtnvfr01234567x':
                check('\\' + b, '\\' + b)

    def test_errors(self):
        # A truncated \x escape raises ValueError in strict mode; "ignore"
        # drops it and "replace" substitutes a '?'.
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
    785 
    786 class RecodingTest(unittest.TestCase):
    787     def test_recoding(self):
    788         f = StringIO.StringIO()
    789         f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
    790         f2.write(u"a")
    791         f2.close()
    792         # Python used to crash on this at exit because of a refcount
    793         # bug in _codecsmodule.c
    794 
# From RFC 3492
# Each entry is a (unicode, punycode-encoded ASCII) pair; the labels
# A..S match the sample strings in RFC 3492 section 7.1.
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
    u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
    u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
    u"\u0939\u0948\u0902",
    "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
    u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

     # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]
    898 
    899 for i in punycode_testcases:
    900     if len(i)!=2:
    901         print repr(i)
    902 
    903 class PunycodeTest(unittest.TestCase):
    904     def test_encode(self):
    905         for uni, puny in punycode_testcases:
    906             # Need to convert both strings to lower case, since
    907             # some of the extended encodings use upper case, but our
    908             # code produces only lower case. Converting just puny to
    909             # lower is also insufficient, since some of the input characters
    910             # are upper case.
    911             self.assertEqual(uni.encode("punycode").lower(), puny.lower())
    912 
    913     def test_decode(self):
    914         for uni, puny in punycode_testcases:
    915             self.assertEqual(uni, puny.decode("punycode"))
    916 
    917 class UnicodeInternalTest(unittest.TestCase):
    918     def test_bug1251300(self):
    919         # Decoding with unicode_internal used to not correctly handle "code
    920         # points" above 0x10ffff on UCS-4 builds.
    921         if sys.maxunicode > 0xffff:
    922             ok = [
    923                 ("\x00\x10\xff\xff", u"\U0010ffff"),
    924                 ("\x00\x00\x01\x01", u"\U00000101"),
    925                 ("", u""),
    926             ]
    927             not_ok = [
    928                 "\x7f\xff\xff\xff",
    929                 "\x80\x00\x00\x00",
    930                 "\x81\x00\x00\x00",
    931                 "\x00",
    932                 "\x00\x00\x00\x00\x00",
    933             ]
    934             for internal, uni in ok:
    935                 if sys.byteorder == "little":
    936                     internal = "".join(reversed(internal))
    937                 self.assertEqual(uni, internal.decode("unicode_internal"))
    938             for internal in not_ok:
    939                 if sys.byteorder == "little":
    940                     internal = "".join(reversed(internal))
    941                 self.assertRaises(UnicodeDecodeError, internal.decode,
    942                     "unicode_internal")
    943 
    944     def test_decode_error_attributes(self):
    945         if sys.maxunicode > 0xffff:
    946             try:
    947                 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
    948             except UnicodeDecodeError, ex:
    949                 self.assertEqual("unicode_internal", ex.encoding)
    950                 self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
    951                 self.assertEqual(4, ex.start)
    952                 self.assertEqual(8, ex.end)
    953             else:
    954                 self.fail()
    955 
    956     def test_decode_callback(self):
    957         if sys.maxunicode > 0xffff:
    958             codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
    959             decoder = codecs.getdecoder("unicode_internal")
    960             ab = u"ab".encode("unicode_internal")
    961             ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
    962                 "UnicodeInternalTest")
    963             self.assertEqual((u"ab", 12), ignored)
    964 
    965     def test_encode_length(self):
    966         # Issue 3739
    967         encoder = codecs.getencoder("unicode_internal")
    968         self.assertEqual(encoder(u"a")[1], 1)
    969         self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
    970 
    971         encoder = codecs.getencoder("string-escape")
    972         self.assertEqual(encoder(r'\x00')[1], 4)
    973 
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is an (input, expected) pair of UTF-8 byte strings.
# expected None means nameprep must reject the input with UnicodeError;
# input None marks a vector skipped here (see NameprepTest below).
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
   1126 
   1127 
   1128 class NameprepTest(unittest.TestCase):
   1129     def test_nameprep(self):
   1130         from encodings.idna import nameprep
   1131         for pos, (orig, prepped) in enumerate(nameprep_tests):
   1132             if orig is None:
   1133                 # Skipped
   1134                 continue
   1135             # The Unicode strings are given in UTF-8
   1136             orig = unicode(orig, "utf-8")
   1137             if prepped is None:
   1138                 # Input contains prohibited characters
   1139                 self.assertRaises(UnicodeError, nameprep, orig)
   1140             else:
   1141                 prepped = unicode(prepped, "utf-8")
   1142                 try:
   1143                     self.assertEqual(nameprep(orig), prepped)
   1144                 except Exception,e:
   1145                     raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
   1146 
   1147 class IDNACodecTest(unittest.TestCase):
   1148     def test_builtin_decode(self):
   1149         self.assertEqual(unicode("python.org", "idna"), u"python.org")
   1150         self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
   1151         self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
   1152         self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
   1153 
   1154     def test_builtin_encode(self):
   1155         self.assertEqual(u"python.org".encode("idna"), "python.org")
   1156         self.assertEqual("python.org.".encode("idna"), "python.org.")
   1157         self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
   1158         self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
   1159 
   1160     def test_stream(self):
   1161         import StringIO
   1162         r = codecs.getreader("idna")(StringIO.StringIO("abc"))
   1163         r.read(3)
   1164         self.assertEqual(r.read(), u"")
   1165 
   1166     def test_incremental_decode(self):
   1167         self.assertEqual(
   1168             "".join(codecs.iterdecode("python.org", "idna")),
   1169             u"python.org"
   1170         )
   1171         self.assertEqual(
   1172             "".join(codecs.iterdecode("python.org.", "idna")),
   1173             u"python.org."
   1174         )
   1175         self.assertEqual(
   1176             "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
   1177             u"pyth\xf6n.org."
   1178         )
   1179         self.assertEqual(
   1180             "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
   1181             u"pyth\xf6n.org."
   1182         )
   1183 
   1184         decoder = codecs.getincrementaldecoder("idna")()
   1185         self.assertEqual(decoder.decode("xn--xam", ), u"")
   1186         self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
   1187         self.assertEqual(decoder.decode(u"rg"), u"")
   1188         self.assertEqual(decoder.decode(u"", True), u"org")
   1189 
   1190         decoder.reset()
   1191         self.assertEqual(decoder.decode("xn--xam", ), u"")
   1192         self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
   1193         self.assertEqual(decoder.decode("rg."), u"org.")
   1194         self.assertEqual(decoder.decode("", True), u"")
   1195 
   1196     def test_incremental_encode(self):
   1197         self.assertEqual(
   1198             "".join(codecs.iterencode(u"python.org", "idna")),
   1199             "python.org"
   1200         )
   1201         self.assertEqual(
   1202             "".join(codecs.iterencode(u"python.org.", "idna")),
   1203             "python.org."
   1204         )
   1205         self.assertEqual(
   1206             "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
   1207             "xn--pythn-mua.org."
   1208         )
   1209         self.assertEqual(
   1210             "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
   1211             "xn--pythn-mua.org."
   1212         )
   1213 
   1214         encoder = codecs.getincrementalencoder("idna")()
   1215         self.assertEqual(encoder.encode(u"\xe4x"), "")
   1216         self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
   1217         self.assertEqual(encoder.encode(u"", True), "org")
   1218 
   1219         encoder.reset()
   1220         self.assertEqual(encoder.encode(u"\xe4x"), "")
   1221         self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
   1222         self.assertEqual(encoder.encode(u"", True), "")
   1223 
class CodecsModuleTest(unittest.TestCase):
    # Smoke tests for the module-level codecs API: encode()/decode() plus
    # the error behaviour of the lookup helpers on bad arguments.

    def test_decode(self):
        self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # With no encoding argument the default encoding is used.
        self.assertEqual(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode(u'abc'), 'abc')
        # NOTE(review): u'\xffff' is u'\xff' followed by the literal "ff",
        # not U+FFFF; the assertion still holds since u'\xff' is non-ASCII.
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        # register() requires exactly one callable argument.
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as a dotless "i"
        oldlocale = locale.getlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')
   1278 
class StreamReaderTest(unittest.TestCase):
    # The stream holds the UTF-8 encoding of u'\ud55c\n\uae00': two
    # multibyte characters separated by a newline.

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        # readlines() must split on the newline sitting between the two
        # multibyte sequences.
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])
   1288 
   1289 class EncodedFileTest(unittest.TestCase):
   1290 
   1291     def test_basic(self):
   1292         f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
   1293         ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
   1294         self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')
   1295 
   1296         f = StringIO.StringIO()
   1297         ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
   1298         ef.write('\xc3\xbc')
   1299         self.assertEqual(f.getvalue(), '\xfc')
   1300 
   1301 class Str2StrTest(unittest.TestCase):
   1302 
   1303     def test_read(self):
   1304         sin = "\x80".encode("base64_codec")
   1305         reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
   1306         sout = reader.read()
   1307         self.assertEqual(sout, "\x80")
   1308         self.assertIsInstance(sout, str)
   1309 
   1310     def test_readline(self):
   1311         sin = "\x80".encode("base64_codec")
   1312         reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
   1313         sout = reader.readline()
   1314         self.assertEqual(sout, "\x80")
   1315         self.assertIsInstance(sout, str)
   1316 
# Encodings expected to round-trip a simple unicode string; exercised
# by BasicUnicodeTest below.  Platform-/build-dependent codecs (mbcs,
# bz2_codec, zlib_codec) are appended conditionally further down.
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]
   1421 
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")  # only present on Windows builds

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# The incremental-coder tests skip the same set of known-broken codecs.
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# bz2/zlib support is optional; only exercise their codecs when the
# underlying compression modules can be imported.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
   1467 
class BasicUnicodeTest(unittest.TestCase):
    """Exercise every registered unicode codec through all of its entry
    points: stateless encoder/decoder, stream reader/writer, incremental
    coders (via both the Python and the C API) and
    iterencode()/iterdecode().
    """

    def test_basics(self):
        """Round-trip a simple ASCII string through every unicode codec."""
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The canonical codec name must match the alias we looked it up
            # under, modulo the "_codec" suffix and '-'/'_' spelling
            # differences (and the special-cased latin_1 name).
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            # Stateless encode/decode must round-trip and report the whole
            # input as consumed.
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer: feed one character/byte at a
                # time to exercise the codec's internal buffering.
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder: encode/decode one
                    # item at a time, then flush with final=True.
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API (same protocol through _testcapi wrappers)
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        """seek(0) on a StreamReader must reset codec state and buffers."""
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        """Decoders must raise TypeError for missing or non-buffer input."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                # idna/punycode accept ints in 2.x, so they are exempt here.
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Encoders must raise TypeError when called without arguments."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        """Importing a charmap codec must not crash on its encoding table."""
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
   1590 
   1591 class BasicStrTest(unittest.TestCase):
   1592     def test_basics(self):
   1593         s = "abc123"
   1594         for encoding in all_string_encodings:
   1595             (bytes, size) = codecs.getencoder(encoding)(s)
   1596             self.assertEqual(size, len(s))
   1597             (chars, size) = codecs.getdecoder(encoding)(bytes)
   1598             self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
   1599 
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() with the three supported mapping
    flavours: unicode-string maps, int->unicode-string maps and
    int->int maps, each under the strict/replace/ignore error modes.
    """

    def test_decode_with_string_map(self):
        """Map is a unicode string indexed by byte value."""
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        # Bytes beyond the end of the map are undecodable under "strict".
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
        )

        # u"\ufffe" in a map marks the position as unmapped.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe"
        )

        # "replace" substitutes U+FFFD for unmapped bytes...
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        # ...while "ignore" silently drops them.
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        # An empty map with "ignore" drops everything but still consumes
        # all input bytes.
        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        """Map is a dict from byte value to replacement unicode string."""
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'a', 1: u'b', 2: u'c'}),
            (u"abc", 3)
        )

        # Multi-character replacements are allowed.
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
            (u"AaBbCc", 3)
        )

        # As are non-BMP replacements.
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
            (u"\U0010FFFFbc", 3)
        )

        # An empty replacement string deletes the byte from the output.
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'a', 1: u'b', 2: u''}),
            (u"ab", 3)
        )

        # Missing keys and None values are undecodable under "strict".
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
                                   {0: u'a', 1: u'b'}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
                                   {0: u'a', 1: u'b', 2: None}
        )

        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
                                   {0: u'a', 1: u'b', 2: u'\ufffe'}
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b'}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b', 2: None}),
            (u"ab\ufffd", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b', 2: u'\ufffe'}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b'}),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b', 2: None}),
            (u"ab", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b', 2: u'\ufffe'}),
            (u"ab", 3)
        )

        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            (u"", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        """Map is a dict from byte value to a unicode ordinal."""
        a = ord(u'a')
        b = ord(u'b')
        c = ord(u'c')

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            (u"abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            (u"\U0010FFFFbc", 3)
        )

        # Ordinals above U+10FFFF are rejected outright (TypeError, not a
        # decode error).
        self.assertRaises(TypeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
                                   {0: 0x110000, 1: b, 2: c}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
                                   {0: a, 1: b},
        )

        # 0xFFFE marks the position as unmapped, as in the other map kinds.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
                                   {0: a, 1: b, 2: 0xFFFE},
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: a, 1: b, 2: 0xFFFE}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: a, 1: b, 2: 0xFFFE}),
            (u"ab", 3)
        )
   1782 
   1783 
   1784 class WithStmtTest(unittest.TestCase):
   1785     def test_encodedfile(self):
   1786         f = StringIO.StringIO("\xc3\xbc")
   1787         with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
   1788             self.assertEqual(ef.read(), "\xfc")
   1789 
   1790     def test_streamreaderwriter(self):
   1791         f = StringIO.StringIO("\xc3\xbc")
   1792         info = codecs.lookup("utf-8")
   1793         with codecs.StreamReaderWriter(f, info.streamreader,
   1794                                        info.streamwriter, 'strict') as srw:
   1795             self.assertEqual(srw.read(), u"\xfc")
   1796 
   1797 
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for the unicode-escape codec (the repr()-style escaping)."""

    def test_empty(self):
        # Empty input encodes/decodes to empty output with 0 consumed.
        self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
        self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))

    def test_raw_encode(self):
        # Printable ASCII (except the backslash) passes through unchanged.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != ord('\\'):
                self.assertEqual(encode(unichr(b)), (chr(b), 1))

    def test_raw_decode(self):
        # Any unescaped byte decodes to the same code point.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != ord('\\'):
                self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))

    def test_escape_encode(self):
        """Control characters and non-ASCII get escaped on encoding."""
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check(u'\t', r'\t')
        check(u'\n', r'\n')
        check(u'\r', r'\r')
        check(u'\\', r'\\')
        # Remaining C0 controls and all bytes >= 127 become \xNN escapes.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(unichr(b), '\\x%02x' % b)
        for b in range(127, 256):
            check(unichr(b), '\\x%02x' % b)
        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        """All recognized escape sequences decode to their character."""
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        # A backslash-newline is a line continuation and vanishes.
        check("[\\\n]", u"[]")
        check(r'[\"]', u'["]')
        check(r"[\']", u"[']")
        check(r"[\\]", ur"[\]")
        check(r"[\a]", u"[\x07]")
        check(r"[\b]", u"[\x08]")
        check(r"[\t]", u"[\x09]")
        check(r"[\n]", u"[\x0a]")
        check(r"[\v]", u"[\x0b]")
        check(r"[\f]", u"[\x0c]")
        check(r"[\r]", u"[\x0d]")
        # Octal escapes take up to three digits; extra digits are literal.
        check(r"[\7]", u"[\x07]")
        check(r"[\8]", ur"[\8]")
        check(r"[\78]", u"[\x078]")
        check(r"[\41]", u"[!]")
        check(r"[\418]", u"[!8]")
        check(r"[\101]", u"[A]")
        check(r"[\1010]", u"[A0]")
        check(r"[\x41]", u"[A]")
        check(r"[\x410]", u"[A0]")
        check(r"\u20ac", u"\u20ac")
        check(r"\U0001d120", u"\U0001d120")
        # An unrecognized escape keeps the backslash in the output.
        for b in range(256):
            if chr(b) not in '\n"\'\\abtnvfr01234567xuUN':
                check('\\' + chr(b), u'\\' + unichr(b))

    def test_decode_errors(self):
        """Truncated \\x, \\u, \\U escapes must raise or be handled per the
        error mode; replaced/ignored escapes still consume their bytes."""
        decode = codecs.unicode_escape_decode
        for c, d in ('x', 2), ('u', 4), ('U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  "\\" + c + "0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  "[\\" + c + "0"*i + "]")
                data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
                self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))
        # Code points beyond U+10FFFF are invalid in \U escapes.
        self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
        self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
        self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
   1874 
   1875 
   1876 class RawUnicodeEscapeTest(unittest.TestCase):
   1877     def test_empty(self):
   1878         self.assertEqual(codecs.raw_unicode_escape_encode(u""), ("", 0))
   1879         self.assertEqual(codecs.raw_unicode_escape_decode(""), (u"", 0))
   1880 
   1881     def test_raw_encode(self):
   1882         encode = codecs.raw_unicode_escape_encode
   1883         for b in range(256):
   1884             self.assertEqual(encode(unichr(b)), (chr(b), 1))
   1885 
   1886     def test_raw_decode(self):
   1887         decode = codecs.raw_unicode_escape_decode
   1888         for b in range(256):
   1889             self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
   1890 
   1891     def test_escape_encode(self):
   1892         encode = codecs.raw_unicode_escape_encode
   1893         check = coding_checker(self, encode)
   1894         for b in range(256):
   1895             if chr(b) not in 'uU':
   1896                 check(u'\\' + unichr(b), '\\' + chr(b))
   1897         check(u'\u20ac', r'\u20ac')
   1898         check(u'\U0001d120', r'\U0001d120')
   1899 
   1900     def test_escape_decode(self):
   1901         decode = codecs.raw_unicode_escape_decode
   1902         check = coding_checker(self, decode)
   1903         for b in range(256):
   1904             if chr(b) not in 'uU':
   1905                 check('\\' + chr(b), u'\\' + unichr(b))
   1906         check(r"\u20ac", u"\u20ac")
   1907         check(r"\U0001d120", u"\U0001d120")
   1908 
   1909     def test_decode_errors(self):
   1910         decode = codecs.raw_unicode_escape_decode
   1911         for c, d in ('u', 4), ('U', 4):
   1912             for i in range(d):
   1913                 self.assertRaises(UnicodeDecodeError, decode,
   1914                                   "\\" + c + "0"*i)
   1915                 self.assertRaises(UnicodeDecodeError, decode,
   1916                                   "[\\" + c + "0"*i + "]")
   1917                 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
   1918                 self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
   1919                 self.assertEqual(decode(data, "replace"),
   1920                                  (u"[\ufffd]\ufffd", len(data)))
   1921         self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
   1922         self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
   1923         self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
   1924 
   1925 
class BomTest(unittest.TestCase):
    """Check BOM handling of the seekable stream writers: the BOM must be
    written exactly once, re-written after seeking back to position 0, and
    never re-written after a seek to any other position.
    """

    def test_seek0(self):
        data = u"1234567890"
        # All the BOM-prefixed encodings.
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            # (drives the underlying writer object directly)
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
   1981 
   1982 
   1983 def test_main():
   1984     test_support.run_unittest(
   1985         UTF32Test,
   1986         UTF32LETest,
   1987         UTF32BETest,
   1988         UTF16Test,
   1989         UTF16LETest,
   1990         UTF16BETest,
   1991         UTF8Test,
   1992         UTF8SigTest,
   1993         UTF7Test,
   1994         UTF16ExTest,
   1995         ReadBufferTest,
   1996         CharBufferTest,
   1997         EscapeDecodeTest,
   1998         RecodingTest,
   1999         PunycodeTest,
   2000         UnicodeInternalTest,
   2001         NameprepTest,
   2002         IDNACodecTest,
   2003         CodecsModuleTest,
   2004         StreamReaderTest,
   2005         EncodedFileTest,
   2006         Str2StrTest,
   2007         BasicUnicodeTest,
   2008         BasicStrTest,
   2009         CharmapTest,
   2010         WithStmtTest,
   2011         UnicodeEscapeTest,
   2012         RawUnicodeEscapeTest,
   2013         BomTest,
   2014     )
   2015 
   2016 
# Allow running this test module directly as a script.
if __name__ == "__main__":
    test_main()
   2019