Home | History | Annotate | Download | only in test
      1 import codecs
      2 import contextlib
      3 import io
      4 import locale
      5 import sys
      6 import unittest
      7 import encodings
      8 from unittest import mock
      9 
     10 from test import support
     11 
# ctypes is an optional dependency here: it is only needed to discover the
# platform's wchar_t width for the tests that depend on it.
try:
    import ctypes
except ImportError:
    ctypes = None
    # Sentinel meaning "unknown"; tests requiring the real size should skip.
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
     19 
     20 def coding_checker(self, coder):
     21     def check(input, expect):
     22         self.assertEqual(coder(input), (expect, len(input)))
     23     return check
     24 
     25 
     26 class Queue(object):
     27     """
     28     queue: write bytes at one end, read bytes from the other end
     29     """
     30     def __init__(self, buffer):
     31         self._buffer = buffer
     32 
     33     def write(self, chars):
     34         self._buffer += chars
     35 
     36     def read(self, size=-1):
     37         if size<0:
     38             s = self._buffer
     39             self._buffer = self._buffer[:0] # make empty
     40             return s
     41         else:
     42             s = self._buffer[:size]
     43             self._buffer = self._buffer[size:]
     44             return s
     45 
     46 
     47 class MixInCheckStateHandling:
     48     def check_state_handling_decode(self, encoding, u, s):
     49         for i in range(len(s)+1):
     50             d = codecs.getincrementaldecoder(encoding)()
     51             part1 = d.decode(s[:i])
     52             state = d.getstate()
     53             self.assertIsInstance(state[1], int)
     54             # Check that the condition stated in the documentation for
     55             # IncrementalDecoder.getstate() holds
     56             if not state[1]:
     57                 # reset decoder to the default state without anything buffered
     58                 d.setstate((state[0][:0], 0))
     59                 # Feeding the previous input may not produce any output
     60                 self.assertTrue(not d.decode(state[0]))
     61                 # The decoder must return to the same state
     62                 self.assertEqual(state, d.getstate())
     63             # Create a new decoder and set it to the state
     64             # we extracted from the old one
     65             d = codecs.getincrementaldecoder(encoding)()
     66             d.setstate(state)
     67             part2 = d.decode(s[i:], True)
     68             self.assertEqual(u, part1+part2)
     69 
     70     def check_state_handling_encode(self, encoding, u, s):
     71         for i in range(len(u)+1):
     72             d = codecs.getincrementalencoder(encoding)()
     73             part1 = d.encode(u[:i])
     74             state = d.getstate()
     75             d = codecs.getincrementalencoder(encoding)()
     76             d.setstate(state)
     77             part2 = d.encode(u[i:], True)
     78             self.assertEqual(s, part1+part2)
     79 
     80 
class ReadTest(MixInCheckStateHandling):
    """Shared tests for stream readers and incremental decoders.

    Mixed into unittest.TestCase subclasses that define ``encoding`` (and,
    for the surrogate tests, ``ill_formed_sequence``).
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        # readline() must honor all four line-end styles below, with and
        # without keepends, and regardless of the size hint.
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        # Interleaving readline()/read()/readlines() must not lose or
        # duplicate data buffered by the reader.
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(1), lines[1][0])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read(n) followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(1), data[5])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Regression test: iterating a reader over this realistic,
        # \r\n-terminated document must yield exactly the original lines.
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Feed the writer incrementally and check that readline() handles
        # line ends split across writes (notably a trailing lone "\r").
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        # Regression test: long lines must not confuse readline()'s
        # internal buffering.
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        # Companion regression test to test_bug1098990_a with a different
        # mix of line lengths.
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # What an ill-formed sequence decodes to with errors="replace";
    # subclasses may override.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        # Unpaired surrogates must be rejected by strict encoding and be
        # handled by each error handler as asserted below.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Encoding "" yields just the BOM (if the encoding has one).
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)
    403 
    404 
    405 class UTF32Test(ReadTest, unittest.TestCase):
    406     encoding = "utf-32"
    407     if sys.byteorder == 'little':
    408         ill_formed_sequence = b"\x80\xdc\x00\x00"
    409     else:
    410         ill_formed_sequence = b"\x00\x00\xdc\x80"
    411 
    412     spamle = (b'\xff\xfe\x00\x00'
    413               b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
    414               b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    415     spambe = (b'\x00\x00\xfe\xff'
    416               b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
    417               b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
    418 
    419     def test_only_one_bom(self):
    420         _,_,reader,writer = codecs.lookup(self.encoding)
    421         # encode some stream
    422         s = io.BytesIO()
    423         f = writer(s)
    424         f.write("spam")
    425         f.write("spam")
    426         d = s.getvalue()
    427         # check whether there is exactly one BOM in it
    428         self.assertTrue(d == self.spamle or d == self.spambe)
    429         # try to read it back
    430         s = io.BytesIO(d)
    431         f = reader(s)
    432         self.assertEqual(f.read(), "spamspam")
    433 
    434     def test_badbom(self):
    435         s = io.BytesIO(4*b"\xff")
    436         f = codecs.getreader(self.encoding)(s)
    437         self.assertRaises(UnicodeError, f.read)
    438 
    439         s = io.BytesIO(8*b"\xff")
    440         f = codecs.getreader(self.encoding)(s)
    441         self.assertRaises(UnicodeError, f.read)
    442 
    443     def test_partial(self):
    444         self.check_partial(
    445             "\x00\xff\u0100\uffff\U00010000",
    446             [
    447                 "", # first byte of BOM read
    448                 "", # second byte of BOM read
    449                 "", # third byte of BOM read
    450                 "", # fourth byte of BOM read => byteorder known
    451                 "",
    452                 "",
    453                 "",
    454                 "\x00",
    455                 "\x00",
    456                 "\x00",
    457                 "\x00",
    458                 "\x00\xff",
    459                 "\x00\xff",
    460                 "\x00\xff",
    461                 "\x00\xff",
    462                 "\x00\xff\u0100",
    463                 "\x00\xff\u0100",
    464                 "\x00\xff\u0100",
    465                 "\x00\xff\u0100",
    466                 "\x00\xff\u0100\uffff",
    467                 "\x00\xff\u0100\uffff",
    468                 "\x00\xff\u0100\uffff",
    469                 "\x00\xff\u0100\uffff",
    470                 "\x00\xff\u0100\uffff\U00010000",
    471             ]
    472         )
    473 
    474     def test_handlers(self):
    475         self.assertEqual(('\ufffd', 1),
    476                          codecs.utf_32_decode(b'\x01', 'replace', True))
    477         self.assertEqual(('', 1),
    478                          codecs.utf_32_decode(b'\x01', 'ignore', True))
    479 
    480     def test_errors(self):
    481         self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
    482                           b"\xff", "strict", True)
    483 
    484     def test_decoder_state(self):
    485         self.check_state_handling_decode(self.encoding,
    486                                          "spamspam", self.spamle)
    487         self.check_state_handling_decode(self.encoding,
    488                                          "spamspam", self.spambe)
    489 
    490     def test_issue8941(self):
    491         # Issue #8941: insufficient result allocation when decoding into
    492         # surrogate pairs on UCS-2 builds.
    493         encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
    494         self.assertEqual('\U00010000' * 1024,
    495                          codecs.utf_32_decode(encoded_le)[0])
    496         encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
    497         self.assertEqual('\U00010000' * 1024,
    498                          codecs.utf_32_decode(encoded_be)[0])
    499 
    500 
    501 class UTF32LETest(ReadTest, unittest.TestCase):
    502     encoding = "utf-32-le"
    503     ill_formed_sequence = b"\x80\xdc\x00\x00"
    504 
    505     def test_partial(self):
    506         self.check_partial(
    507             "\x00\xff\u0100\uffff\U00010000",
    508             [
    509                 "",
    510                 "",
    511                 "",
    512                 "\x00",
    513                 "\x00",
    514                 "\x00",
    515                 "\x00",
    516                 "\x00\xff",
    517                 "\x00\xff",
    518                 "\x00\xff",
    519                 "\x00\xff",
    520                 "\x00\xff\u0100",
    521                 "\x00\xff\u0100",
    522                 "\x00\xff\u0100",
    523                 "\x00\xff\u0100",
    524                 "\x00\xff\u0100\uffff",
    525                 "\x00\xff\u0100\uffff",
    526                 "\x00\xff\u0100\uffff",
    527                 "\x00\xff\u0100\uffff",
    528                 "\x00\xff\u0100\uffff\U00010000",
    529             ]
    530         )
    531 
    532     def test_simple(self):
    533         self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")
    534 
    535     def test_errors(self):
    536         self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
    537                           b"\xff", "strict", True)
    538 
    539     def test_issue8941(self):
    540         # Issue #8941: insufficient result allocation when decoding into
    541         # surrogate pairs on UCS-2 builds.
    542         encoded = b'\x00\x00\x01\x00' * 1024
    543         self.assertEqual('\U00010000' * 1024,
    544                          codecs.utf_32_le_decode(encoded)[0])
    545 
    546 
    547 class UTF32BETest(ReadTest, unittest.TestCase):
    548     encoding = "utf-32-be"
    549     ill_formed_sequence = b"\x00\x00\xdc\x80"
    550 
    551     def test_partial(self):
    552         self.check_partial(
    553             "\x00\xff\u0100\uffff\U00010000",
    554             [
    555                 "",
    556                 "",
    557                 "",
    558                 "\x00",
    559                 "\x00",
    560                 "\x00",
    561                 "\x00",
    562                 "\x00\xff",
    563                 "\x00\xff",
    564                 "\x00\xff",
    565                 "\x00\xff",
    566                 "\x00\xff\u0100",
    567                 "\x00\xff\u0100",
    568                 "\x00\xff\u0100",
    569                 "\x00\xff\u0100",
    570                 "\x00\xff\u0100\uffff",
    571                 "\x00\xff\u0100\uffff",
    572                 "\x00\xff\u0100\uffff",
    573                 "\x00\xff\u0100\uffff",
    574                 "\x00\xff\u0100\uffff\U00010000",
    575             ]
    576         )
    577 
    578     def test_simple(self):
    579         self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")
    580 
    581     def test_errors(self):
    582         self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
    583                           b"\xff", "strict", True)
    584 
    585     def test_issue8941(self):
    586         # Issue #8941: insufficient result allocation when decoding into
    587         # surrogate pairs on UCS-2 builds.
    588         encoded = b'\x00\x01\x00\x00' * 1024
    589         self.assertEqual('\U00010000' * 1024,
    590                          codecs.utf_32_be_decode(encoded)[0])
    591 
    592 
class UTF16Test(ReadTest, unittest.TestCase):
    encoding = "utf-16"
    # An unpaired surrogate (U+DC80) encoded in the machine's native byte
    # order; consumed by ReadTest.test_lone_surrogates.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" with exactly one BOM, little- and big-endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # The StreamWriter must emit the BOM once, before the first chunk
        # only, and the StreamReader must consume it transparently.
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # b"\xff\xff" is not a valid BOM: reading must fail.
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # Expected output after each byte fed: nothing until the 2-byte BOM
        # settles the byte order, then one character per complete unit (the
        # final non-BMP character needs all four surrogate-pair bytes).
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A lone trailing byte: 'replace' yields U+FFFD, 'ignore' drops it;
        # both report one byte consumed.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        # A truncated code unit must raise in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        # State transfer must work for both byte orders (BOM included).
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # codecs.open() in 'U' mode is deprecated; the warning is expected.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
    678 
    679 class UTF16LETest(ReadTest, unittest.TestCase):
    680     encoding = "utf-16-le"
    681     ill_formed_sequence = b"\x80\xdc"
    682 
    683     def test_partial(self):
    684         self.check_partial(
    685             "\x00\xff\u0100\uffff\U00010000",
    686             [
    687                 "",
    688                 "\x00",
    689                 "\x00",
    690                 "\x00\xff",
    691                 "\x00\xff",
    692                 "\x00\xff\u0100",
    693                 "\x00\xff\u0100",
    694                 "\x00\xff\u0100\uffff",
    695                 "\x00\xff\u0100\uffff",
    696                 "\x00\xff\u0100\uffff",
    697                 "\x00\xff\u0100\uffff",
    698                 "\x00\xff\u0100\uffff\U00010000",
    699             ]
    700         )
    701 
    702     def test_errors(self):
    703         tests = [
    704             (b'\xff', '\ufffd'),
    705             (b'A\x00Z', 'A\ufffd'),
    706             (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
    707             (b'\x00\xd8', '\ufffd'),
    708             (b'\x00\xd8A', '\ufffd'),
    709             (b'\x00\xd8A\x00', '\ufffdA'),
    710             (b'\x00\xdcA\x00', '\ufffdA'),
    711         ]
    712         for raw, expected in tests:
    713             self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
    714                               raw, 'strict', True)
    715             self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
    716 
    717     def test_nonbmp(self):
    718         self.assertEqual("\U00010203".encode(self.encoding),
    719                          b'\x00\xd8\x03\xde')
    720         self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
    721                          "\U00010203")
    722 
    723 class UTF16BETest(ReadTest, unittest.TestCase):
    724     encoding = "utf-16-be"
    725     ill_formed_sequence = b"\xdc\x80"
    726 
    727     def test_partial(self):
    728         self.check_partial(
    729             "\x00\xff\u0100\uffff\U00010000",
    730             [
    731                 "",
    732                 "\x00",
    733                 "\x00",
    734                 "\x00\xff",
    735                 "\x00\xff",
    736                 "\x00\xff\u0100",
    737                 "\x00\xff\u0100",
    738                 "\x00\xff\u0100\uffff",
    739                 "\x00\xff\u0100\uffff",
    740                 "\x00\xff\u0100\uffff",
    741                 "\x00\xff\u0100\uffff",
    742                 "\x00\xff\u0100\uffff\U00010000",
    743             ]
    744         )
    745 
    746     def test_errors(self):
    747         tests = [
    748             (b'\xff', '\ufffd'),
    749             (b'\x00A\xff', 'A\ufffd'),
    750             (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
    751             (b'\xd8\x00', '\ufffd'),
    752             (b'\xd8\x00\xdc', '\ufffd'),
    753             (b'\xd8\x00\x00A', '\ufffdA'),
    754             (b'\xdc\x00\x00A', '\ufffdA'),
    755         ]
    756         for raw, expected in tests:
    757             self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
    758                               raw, 'strict', True)
    759             self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
    760 
    761     def test_nonbmp(self):
    762         self.assertEqual("\U00010203".encode(self.encoding),
    763                          b'\xd8\x00\xde\x03')
    764         self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
    765                          "\U00010203")
    766 
    767 class UTF8Test(ReadTest, unittest.TestCase):
    768     encoding = "utf-8"
    769     ill_formed_sequence = b"\xed\xb2\x80"
    770     ill_formed_sequence_replace = "\ufffd" * 3
    771     BOM = b''
    772 
    773     def test_partial(self):
    774         self.check_partial(
    775             "\x00\xff\u07ff\u0800\uffff\U00010000",
    776             [
    777                 "\x00",
    778                 "\x00",
    779                 "\x00\xff",
    780                 "\x00\xff",
    781                 "\x00\xff\u07ff",
    782                 "\x00\xff\u07ff",
    783                 "\x00\xff\u07ff",
    784                 "\x00\xff\u07ff\u0800",
    785                 "\x00\xff\u07ff\u0800",
    786                 "\x00\xff\u07ff\u0800",
    787                 "\x00\xff\u07ff\u0800\uffff",
    788                 "\x00\xff\u07ff\u0800\uffff",
    789                 "\x00\xff\u07ff\u0800\uffff",
    790                 "\x00\xff\u07ff\u0800\uffff",
    791                 "\x00\xff\u07ff\u0800\uffff\U00010000",
    792             ]
    793         )
    794 
    795     def test_decoder_state(self):
    796         u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
    797         self.check_state_handling_decode(self.encoding,
    798                                          u, u.encode(self.encoding))
    799 
    800     def test_decode_error(self):
    801         for data, error_handler, expected in (
    802             (b'[\x80\xff]', 'ignore', '[]'),
    803             (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
    804             (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
    805             (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
    806         ):
    807             with self.subTest(data=data, error_handler=error_handler,
    808                               expected=expected):
    809                 self.assertEqual(data.decode(self.encoding, error_handler),
    810                                  expected)
    811 
    812     def test_lone_surrogates(self):
    813         super().test_lone_surrogates()
    814         # not sure if this is making sense for
    815         # UTF-16 and UTF-32
    816         self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"),
    817                          self.BOM + b'[\x80]')
    818 
    819         with self.assertRaises(UnicodeEncodeError) as cm:
    820             "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
    821         exc = cm.exception
    822         self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')
    823 
    824     def test_surrogatepass_handler(self):
    825         self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"),
    826                          self.BOM + b"abc\xed\xa0\x80def")
    827         self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"),
    828                          self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80")
    829         self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"),
    830                          self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]')
    831 
    832         self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"),
    833                          "abc\ud800def")
    834         self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"),
    835                          "\U00010fff\uD800")
    836 
    837         self.assertTrue(codecs.lookup_error("surrogatepass"))
    838         with self.assertRaises(UnicodeDecodeError):
    839             b"abc\xed\xa0".decode(self.encoding, "surrogatepass")
    840         with self.assertRaises(UnicodeDecodeError):
    841             b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")
    842 
    843 
    844 @unittest.skipUnless(sys.platform == 'win32',
    845                      'cp65001 is a Windows-only codec')
    846 class CP65001Test(ReadTest, unittest.TestCase):
    847     encoding = "cp65001"
    848 
    849     def test_encode(self):
    850         tests = [
    851             ('abc', 'strict', b'abc'),
    852             ('\xe9\u20ac', 'strict',  b'\xc3\xa9\xe2\x82\xac'),
    853             ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
    854             ('\udc80', 'strict', None),
    855             ('\udc80', 'ignore', b''),
    856             ('\udc80', 'replace', b'?'),
    857             ('\udc80', 'backslashreplace', b'\\udc80'),
    858             ('\udc80', 'namereplace', b'\\udc80'),
    859             ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
    860         ]
    861         for text, errors, expected in tests:
    862             if expected is not None:
    863                 try:
    864                     encoded = text.encode('cp65001', errors)
    865                 except UnicodeEncodeError as err:
    866                     self.fail('Unable to encode %a to cp65001 with '
    867                               'errors=%r: %s' % (text, errors, err))
    868                 self.assertEqual(encoded, expected,
    869                     '%a.encode("cp65001", %r)=%a != %a'
    870                     % (text, errors, encoded, expected))
    871             else:
    872                 self.assertRaises(UnicodeEncodeError,
    873                     text.encode, "cp65001", errors)
    874 
    875     def test_decode(self):
    876         tests = [
    877             (b'abc', 'strict', 'abc'),
    878             (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
    879             (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
    880             (b'\xef\xbf\xbd', 'strict', '\ufffd'),
    881             (b'[\xc3\xa9]', 'strict', '[\xe9]'),
    882             # invalid bytes
    883             (b'[\xff]', 'strict', None),
    884             (b'[\xff]', 'ignore', '[]'),
    885             (b'[\xff]', 'replace', '[\ufffd]'),
    886             (b'[\xff]', 'surrogateescape', '[\udcff]'),
    887             (b'[\xed\xb2\x80]', 'strict', None),
    888             (b'[\xed\xb2\x80]', 'ignore', '[]'),
    889             (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
    890         ]
    891         for raw, errors, expected in tests:
    892             if expected is not None:
    893                 try:
    894                     decoded = raw.decode('cp65001', errors)
    895                 except UnicodeDecodeError as err:
    896                     self.fail('Unable to decode %a from cp65001 with '
    897                               'errors=%r: %s' % (raw, errors, err))
    898                 self.assertEqual(decoded, expected,
    899                     '%a.decode("cp65001", %r)=%a != %a'
    900                     % (raw, errors, decoded, expected))
    901             else:
    902                 self.assertRaises(UnicodeDecodeError,
    903                     raw.decode, 'cp65001', errors)
    904 
    905     def test_lone_surrogates(self):
    906         self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
    907         self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
    908         self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
    909                          b'[\\udc80]')
    910         self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
    911                          b'[\\udc80]')
    912         self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
    913                          b'[&#56448;]')
    914         self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
    915                          b'[\x80]')
    916         self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
    917                          b'[]')
    918         self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
    919                          b'[?]')
    920 
    921     def test_surrogatepass_handler(self):
    922         self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
    923                          b"abc\xed\xa0\x80def")
    924         self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
    925                          "abc\ud800def")
    926         self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
    927                          b"\xf0\x90\xbf\xbf\xed\xa0\x80")
    928         self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
    929                          "\U00010fff\uD800")
    930         self.assertTrue(codecs.lookup_error("surrogatepass"))
    931 
    932 
    933 class UTF7Test(ReadTest, unittest.TestCase):
    934     encoding = "utf-7"
    935 
    936     def test_ascii(self):
    937         # Set D (directly encoded characters)
    938         set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    939                  'abcdefghijklmnopqrstuvwxyz'
    940                  '0123456789'
    941                  '\'(),-./:?')
    942         self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
    943         self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
    944         # Set O (optional direct characters)
    945         set_o = ' !"#$%&*;<=>@[]^_`{|}'
    946         self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
    947         self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
    948         # +
    949         self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
    950         self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
    951         # White spaces
    952         ws = ' \t\n\r'
    953         self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
    954         self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
    955         # Other ASCII characters
    956         other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
    957                                      set(set_d + set_o + '+' + ws)))
    958         self.assertEqual(other_ascii.encode(self.encoding),
    959                          b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
    960                          b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')
    961 
    962     def test_partial(self):
    963         self.check_partial(
    964             'a+-b\x00c\x80d\u0100e\U00010000f',
    965             [
    966                 'a',
    967                 'a',
    968                 'a+',
    969                 'a+-',
    970                 'a+-b',
    971                 'a+-b',
    972                 'a+-b',
    973                 'a+-b',
    974                 'a+-b',
    975                 'a+-b\x00',
    976                 'a+-b\x00c',
    977                 'a+-b\x00c',
    978                 'a+-b\x00c',
    979                 'a+-b\x00c',
    980                 'a+-b\x00c',
    981                 'a+-b\x00c\x80',
    982                 'a+-b\x00c\x80d',
    983                 'a+-b\x00c\x80d',
    984                 'a+-b\x00c\x80d',
    985                 'a+-b\x00c\x80d',
    986                 'a+-b\x00c\x80d',
    987                 'a+-b\x00c\x80d\u0100',
    988                 'a+-b\x00c\x80d\u0100e',
    989                 'a+-b\x00c\x80d\u0100e',
    990                 'a+-b\x00c\x80d\u0100e',
    991                 'a+-b\x00c\x80d\u0100e',
    992                 'a+-b\x00c\x80d\u0100e',
    993                 'a+-b\x00c\x80d\u0100e',
    994                 'a+-b\x00c\x80d\u0100e',
    995                 'a+-b\x00c\x80d\u0100e',
    996                 'a+-b\x00c\x80d\u0100e\U00010000',
    997                 'a+-b\x00c\x80d\u0100e\U00010000f',
    998             ]
    999         )
   1000 
   1001     def test_errors(self):
   1002         tests = [
   1003             (b'\xffb', '\ufffdb'),
   1004             (b'a\xffb', 'a\ufffdb'),
   1005             (b'a\xff\xffb', 'a\ufffd\ufffdb'),
   1006             (b'a+IK', 'a\ufffd'),
   1007             (b'a+IK-b', 'a\ufffdb'),
   1008             (b'a+IK,b', 'a\ufffdb'),
   1009             (b'a+IKx', 'a\u20ac\ufffd'),
   1010             (b'a+IKx-b', 'a\u20ac\ufffdb'),
   1011             (b'a+IKwgr', 'a\u20ac\ufffd'),
   1012             (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
   1013             (b'a+IKwgr,', 'a\u20ac\ufffd'),
   1014             (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
   1015             (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
   1016             (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
   1017             (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
   1018             (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
   1019             (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
   1020             (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
   1021             (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
   1022             (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
   1023         ]
   1024         for raw, expected in tests:
   1025             with self.subTest(raw=raw):
   1026                 self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
   1027                                 raw, 'strict', True)
   1028                 self.assertEqual(raw.decode('utf-7', 'replace'), expected)
   1029 
   1030     def test_nonbmp(self):
   1031         self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
   1032         self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
   1033         self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
   1034         self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
   1035         self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
   1036         self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
   1037         self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
   1038         self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
   1039                          b'+IKwgrNgB3KA-')
   1040         self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
   1041                          '\u20ac\u20ac\U000104A0')
   1042         self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
   1043                          '\u20ac\u20ac\U000104A0')
   1044 
   1045     def test_lone_surrogates(self):
   1046         tests = [
   1047             (b'a+2AE-b', 'a\ud801b'),
   1048             (b'a+2AE\xffb', 'a\ufffdb'),
   1049             (b'a+2AE', 'a\ufffd'),
   1050             (b'a+2AEA-b', 'a\ufffdb'),
   1051             (b'a+2AH-b', 'a\ufffdb'),
   1052             (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
   1053             (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
   1054             (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
   1055             (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
   1056             (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
   1057             (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
   1058             (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
   1059             (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
   1060         ]
   1061         for raw, expected in tests:
   1062             with self.subTest(raw=raw):
   1063                 self.assertEqual(raw.decode('utf-7', 'replace'), expected)
   1064 
   1065 
   1066 class UTF16ExTest(unittest.TestCase):
   1067 
   1068     def test_errors(self):
   1069         self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True)
   1070 
   1071     def test_bad_args(self):
   1072         self.assertRaises(TypeError, codecs.utf_16_ex_decode)
   1073 
   1074 class ReadBufferTest(unittest.TestCase):
   1075 
   1076     def test_array(self):
   1077         import array
   1078         self.assertEqual(
   1079             codecs.readbuffer_encode(array.array("b", b"spam")),
   1080             (b"spam", 4)
   1081         )
   1082 
   1083     def test_empty(self):
   1084         self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))
   1085 
   1086     def test_bad_args(self):
   1087         self.assertRaises(TypeError, codecs.readbuffer_encode)
   1088         self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
   1089 
   1090 class UTF8SigTest(UTF8Test, unittest.TestCase):
   1091     encoding = "utf-8-sig"
   1092     BOM = codecs.BOM_UTF8
   1093 
   1094     def test_partial(self):
   1095         self.check_partial(
   1096             "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
   1097             [
   1098                 "",
   1099                 "",
   1100                 "", # First BOM has been read and skipped
   1101                 "",
   1102                 "",
   1103                 "\ufeff", # Second BOM has been read and emitted
   1104                 "\ufeff\x00", # "\x00" read and emitted
   1105                 "\ufeff\x00", # First byte of encoded "\xff" read
   1106                 "\ufeff\x00\xff", # Second byte of encoded "\xff" read
   1107                 "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
   1108                 "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
   1109                 "\ufeff\x00\xff\u07ff",
   1110                 "\ufeff\x00\xff\u07ff",
   1111                 "\ufeff\x00\xff\u07ff\u0800",
   1112                 "\ufeff\x00\xff\u07ff\u0800",
   1113                 "\ufeff\x00\xff\u07ff\u0800",
   1114                 "\ufeff\x00\xff\u07ff\u0800\uffff",
   1115                 "\ufeff\x00\xff\u07ff\u0800\uffff",
   1116                 "\ufeff\x00\xff\u07ff\u0800\uffff",
   1117                 "\ufeff\x00\xff\u07ff\u0800\uffff",
   1118                 "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
   1119             ]
   1120         )
   1121 
   1122     def test_bug1601501(self):
   1123         # SF bug #1601501: check that the codec works with a buffer
   1124         self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")
   1125 
   1126     def test_bom(self):
   1127         d = codecs.getincrementaldecoder("utf-8-sig")()
   1128         s = "spam"
   1129         self.assertEqual(d.decode(s.encode("utf-8-sig")), s)
   1130 
   1131     def test_stream_bom(self):
   1132         unistring = "ABC\u00A1\u2200XYZ"
   1133         bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
   1134 
   1135         reader = codecs.getreader("utf-8-sig")
   1136         for sizehint in [None] + list(range(1, 11)) + \
   1137                         [64, 128, 256, 512, 1024]:
   1138             istream = reader(io.BytesIO(bytestring))
   1139             ostream = io.StringIO()
   1140             while 1:
   1141                 if sizehint is not None:
   1142                     data = istream.read(sizehint)
   1143                 else:
   1144                     data = istream.read()
   1145 
   1146                 if not data:
   1147                     break
   1148                 ostream.write(data)
   1149 
   1150             got = ostream.getvalue()
   1151             self.assertEqual(got, unistring)
   1152 
   1153     def test_stream_bare(self):
   1154         unistring = "ABC\u00A1\u2200XYZ"
   1155         bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
   1156 
   1157         reader = codecs.getreader("utf-8-sig")
   1158         for sizehint in [None] + list(range(1, 11)) + \
   1159                         [64, 128, 256, 512, 1024]:
   1160             istream = reader(io.BytesIO(bytestring))
   1161             ostream = io.StringIO()
   1162             while 1:
   1163                 if sizehint is not None:
   1164                     data = istream.read(sizehint)
   1165                 else:
   1166                     data = istream.read()
   1167 
   1168                 if not data:
   1169                     break
   1170                 ostream.write(data)
   1171 
   1172             got = ostream.getvalue()
   1173             self.assertEqual(got, unistring)
   1174 
   1175 class EscapeDecodeTest(unittest.TestCase):
   1176     def test_empty(self):
   1177         self.assertEqual(codecs.escape_decode(b""), (b"", 0))
   1178         self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))
   1179 
   1180     def test_raw(self):
   1181         decode = codecs.escape_decode
   1182         for b in range(256):
   1183             b = bytes([b])
   1184             if b != b'\\':
   1185                 self.assertEqual(decode(b + b'0'), (b + b'0', 2))
   1186 
   1187     def test_escape(self):
   1188         decode = codecs.escape_decode
   1189         check = coding_checker(self, decode)
   1190         check(b"[\\\n]", b"[]")
   1191         check(br'[\"]', b'["]')
   1192         check(br"[\']", b"[']")
   1193         check(br"[\\]", b"[\\]")
   1194         check(br"[\a]", b"[\x07]")
   1195         check(br"[\b]", b"[\x08]")
   1196         check(br"[\t]", b"[\x09]")
   1197         check(br"[\n]", b"[\x0a]")
   1198         check(br"[\v]", b"[\x0b]")
   1199         check(br"[\f]", b"[\x0c]")
   1200         check(br"[\r]", b"[\x0d]")
   1201         check(br"[\7]", b"[\x07]")
   1202         check(br"[\78]", b"[\x078]")
   1203         check(br"[\41]", b"[!]")
   1204         check(br"[\418]", b"[!8]")
   1205         check(br"[\101]", b"[A]")
   1206         check(br"[\1010]", b"[A0]")
   1207         check(br"[\501]", b"[A]")
   1208         check(br"[\x41]", b"[A]")
   1209         check(br"[\x410]", b"[A0]")
   1210         for i in range(97, 123):
   1211             b = bytes([i])
   1212             if b not in b'abfnrtvx':
   1213                 with self.assertWarns(DeprecationWarning):
   1214                     check(b"\\" + b, b"\\" + b)
   1215             with self.assertWarns(DeprecationWarning):
   1216                 check(b"\\" + b.upper(), b"\\" + b.upper())
   1217         with self.assertWarns(DeprecationWarning):
   1218             check(br"\8", b"\\8")
   1219         with self.assertWarns(DeprecationWarning):
   1220             check(br"\9", b"\\9")
   1221         with self.assertWarns(DeprecationWarning):
   1222             check(b"\\\xfa", b"\\\xfa")
   1223 
   1224     def test_errors(self):
   1225         decode = codecs.escape_decode
   1226         self.assertRaises(ValueError, decode, br"\x")
   1227         self.assertRaises(ValueError, decode, br"[\x]")
   1228         self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
   1229         self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
   1230         self.assertRaises(ValueError, decode, br"\x0")
   1231         self.assertRaises(ValueError, decode, br"[\x0]")
   1232         self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
   1233         self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
   1234 
   1235 
   1236 class RecodingTest(unittest.TestCase):
   1237     def test_recoding(self):
   1238         f = io.BytesIO()
   1239         f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
   1240         f2.write("a")
   1241         f2.close()
   1242         # Python used to crash on this at exit because of a refcount
   1243         # bug in _codecsmodule.c
   1244 
   1245         self.assertTrue(f.closed)
   1246 
# From RFC 3492
# Each entry is a (unicode, punycode) pair: the Unicode sample string from
# section 7.1 of the RFC and its expected Punycode encoding as ASCII bytes
# (without the "xn--" ACE prefix used by IDNA).
punycode_testcases = [
    # A Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

     # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]
   1350 
   1351 for i in punycode_testcases:
   1352     if len(i)!=2:
   1353         print(repr(i))
   1354 
   1355 
   1356 class PunycodeTest(unittest.TestCase):
   1357     def test_encode(self):
   1358         for uni, puny in punycode_testcases:
   1359             # Need to convert both strings to lower case, since
   1360             # some of the extended encodings use upper case, but our
   1361             # code produces only lower case. Converting just puny to
   1362             # lower is also insufficient, since some of the input characters
   1363             # are upper case.
   1364             self.assertEqual(
   1365                 str(uni.encode("punycode"), "ascii").lower(),
   1366                 str(puny, "ascii").lower()
   1367             )
   1368 
   1369     def test_decode(self):
   1370         for uni, puny in punycode_testcases:
   1371             self.assertEqual(uni, puny.decode("punycode"))
   1372             puny = puny.decode("ascii").encode("ascii")
   1373             self.assertEqual(uni, puny.decode("punycode"))
   1374 
   1375 
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated "unicode_internal" codec.

    Most tests are restricted to UCS-4 builds (sizeof(wchar_t) == 4)
    because they construct raw 4-byte internal representations directly.
    """

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for internal, uni in ok:
            if sys.byteorder == "little":
                # The table above is big-endian; mirror it on LE builds.
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        # The "replace" and "backslashreplace" error handlers must also
        # cope with an out-of-range 4-byte sequence.
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
            invalid_backslashreplace = r"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
            invalid_backslashreplace = r"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "backslashreplace"),
                             invalid_backslashreplace)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # A decode failure must report encoding, object, start and end on
        # the raised UnicodeDecodeError.
        try:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
        except UnicodeDecodeError as ex:
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            self.fail()

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # A custom error handler registered via codecs.register_error()
        # must be honoured by the unicode_internal decoder.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            # Splice four invalid bytes between the two encoded characters;
            # the "ignore" handler should skip them (12 bytes consumed).
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739: the reported length is in characters, not bytes.
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
   1456 
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
# expected is None when nameprep() must reject the input with UnicodeError;
# a (None, None) entry marks a vector that is skipped entirely.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
   1609 
   1610 
   1611 class NameprepTest(unittest.TestCase):
   1612     def test_nameprep(self):
   1613         from encodings.idna import nameprep
   1614         for pos, (orig, prepped) in enumerate(nameprep_tests):
   1615             if orig is None:
   1616                 # Skipped
   1617                 continue
   1618             # The Unicode strings are given in UTF-8
   1619             orig = str(orig, "utf-8", "surrogatepass")
   1620             if prepped is None:
   1621                 # Input contains prohibited characters
   1622                 self.assertRaises(UnicodeError, nameprep, orig)
   1623             else:
   1624                 prepped = str(prepped, "utf-8", "surrogatepass")
   1625                 try:
   1626                     self.assertEqual(nameprep(orig), prepped)
   1627                 except Exception as e:
   1628                     raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
   1629 
   1630 
   1631 class IDNACodecTest(unittest.TestCase):
   1632     def test_builtin_decode(self):
   1633         self.assertEqual(str(b"python.org", "idna"), "python.org")
   1634         self.assertEqual(str(b"python.org.", "idna"), "python.org.")
   1635         self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
   1636         self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")
   1637 
   1638     def test_builtin_encode(self):
   1639         self.assertEqual("python.org".encode("idna"), b"python.org")
   1640         self.assertEqual("python.org.".encode("idna"), b"python.org.")
   1641         self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
   1642         self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")
   1643 
   1644     def test_stream(self):
   1645         r = codecs.getreader("idna")(io.BytesIO(b"abc"))
   1646         r.read(3)
   1647         self.assertEqual(r.read(), "")
   1648 
   1649     def test_incremental_decode(self):
   1650         self.assertEqual(
   1651             "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
   1652             "python.org"
   1653         )
   1654         self.assertEqual(
   1655             "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
   1656             "python.org."
   1657         )
   1658         self.assertEqual(
   1659             "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
   1660             "pyth\xf6n.org."
   1661         )
   1662         self.assertEqual(
   1663             "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
   1664             "pyth\xf6n.org."
   1665         )
   1666 
   1667         decoder = codecs.getincrementaldecoder("idna")()
   1668         self.assertEqual(decoder.decode(b"xn--xam", ), "")
   1669         self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
   1670         self.assertEqual(decoder.decode(b"rg"), "")
   1671         self.assertEqual(decoder.decode(b"", True), "org")
   1672 
   1673         decoder.reset()
   1674         self.assertEqual(decoder.decode(b"xn--xam", ), "")
   1675         self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
   1676         self.assertEqual(decoder.decode(b"rg."), "org.")
   1677         self.assertEqual(decoder.decode(b"", True), "")
   1678 
   1679     def test_incremental_encode(self):
   1680         self.assertEqual(
   1681             b"".join(codecs.iterencode("python.org", "idna")),
   1682             b"python.org"
   1683         )
   1684         self.assertEqual(
   1685             b"".join(codecs.iterencode("python.org.", "idna")),
   1686             b"python.org."
   1687         )
   1688         self.assertEqual(
   1689             b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
   1690             b"xn--pythn-mua.org."
   1691         )
   1692         self.assertEqual(
   1693             b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
   1694             b"xn--pythn-mua.org."
   1695         )
   1696 
   1697         encoder = codecs.getincrementalencoder("idna")()
   1698         self.assertEqual(encoder.encode("\xe4x"), b"")
   1699         self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
   1700         self.assertEqual(encoder.encode("", True), b"org")
   1701 
   1702         encoder.reset()
   1703         self.assertEqual(encoder.encode("\xe4x"), b"")
   1704         self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
   1705         self.assertEqual(encoder.encode("", True), b"")
   1706 
   1707     def test_errors(self):
   1708         """Only supports "strict" error handler"""
   1709         "python.org".encode("idna", "strict")
   1710         b"python.org".decode("idna", "strict")
   1711         for errors in ("ignore", "replace", "backslashreplace",
   1712                 "surrogateescape"):
   1713             self.assertRaises(Exception, "python.org".encode, "idna", errors)
   1714             self.assertRaises(Exception,
   1715                 b"python.org".decode, "idna", errors)
   1716 
   1717 
   1718 class CodecsModuleTest(unittest.TestCase):
   1719 
   1720     def test_decode(self):
   1721         self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
   1722                          '\xe4\xf6\xfc')
   1723         self.assertRaises(TypeError, codecs.decode)
   1724         self.assertEqual(codecs.decode(b'abc'), 'abc')
   1725         self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
   1726 
   1727         # test keywords
   1728         self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
   1729                          '\xe4\xf6\xfc')
   1730         self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
   1731                          '[]')
   1732 
   1733     def test_encode(self):
   1734         self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
   1735                          b'\xe4\xf6\xfc')
   1736         self.assertRaises(TypeError, codecs.encode)
   1737         self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
   1738         self.assertEqual(codecs.encode('abc'), b'abc')
   1739         self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')
   1740 
   1741         # test keywords
   1742         self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
   1743                          b'\xe4\xf6\xfc')
   1744         self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
   1745                          b'[]')
   1746 
   1747     def test_register(self):
   1748         self.assertRaises(TypeError, codecs.register)
   1749         self.assertRaises(TypeError, codecs.register, 42)
   1750 
   1751     def test_lookup(self):
   1752         self.assertRaises(TypeError, codecs.lookup)
   1753         self.assertRaises(LookupError, codecs.lookup, "__spam__")
   1754         self.assertRaises(LookupError, codecs.lookup, " ")
   1755 
   1756     def test_getencoder(self):
   1757         self.assertRaises(TypeError, codecs.getencoder)
   1758         self.assertRaises(LookupError, codecs.getencoder, "__spam__")
   1759 
   1760     def test_getdecoder(self):
   1761         self.assertRaises(TypeError, codecs.getdecoder)
   1762         self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
   1763 
   1764     def test_getreader(self):
   1765         self.assertRaises(TypeError, codecs.getreader)
   1766         self.assertRaises(LookupError, codecs.getreader, "__spam__")
   1767 
   1768     def test_getwriter(self):
   1769         self.assertRaises(TypeError, codecs.getwriter)
   1770         self.assertRaises(LookupError, codecs.getwriter, "__spam__")
   1771 
   1772     def test_lookup_issue1813(self):
   1773         # Issue #1813: under Turkish locales, lookup of some codecs failed
   1774         # because 'I' is lowercased as "" (dotless i)
   1775         oldlocale = locale.setlocale(locale.LC_CTYPE)
   1776         self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
   1777         try:
   1778             locale.setlocale(locale.LC_CTYPE, 'tr_TR')
   1779         except locale.Error:
   1780             # Unsupported locale on this system
   1781             self.skipTest('test needs Turkish locale')
   1782         c = codecs.lookup('ASCII')
   1783         self.assertEqual(c.name, 'ascii')
   1784 
   1785     def test_all(self):
   1786         api = (
   1787             "encode", "decode",
   1788             "register", "CodecInfo", "Codec", "IncrementalEncoder",
   1789             "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
   1790             "getencoder", "getdecoder", "getincrementalencoder",
   1791             "getincrementaldecoder", "getreader", "getwriter",
   1792             "register_error", "lookup_error",
   1793             "strict_errors", "replace_errors", "ignore_errors",
   1794             "xmlcharrefreplace_errors", "backslashreplace_errors",
   1795             "namereplace_errors",
   1796             "open", "EncodedFile",
   1797             "iterencode", "iterdecode",
   1798             "BOM", "BOM_BE", "BOM_LE",
   1799             "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
   1800             "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
   1801             "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
   1802             "StreamReaderWriter", "StreamRecoder",
   1803         )
   1804         self.assertCountEqual(api, codecs.__all__)
   1805         for api in codecs.__all__:
   1806             getattr(codecs, api)
   1807 
   1808     def test_open(self):
   1809         self.addCleanup(support.unlink, support.TESTFN)
   1810         for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
   1811             with self.subTest(mode), \
   1812                     codecs.open(support.TESTFN, mode, 'ascii') as file:
   1813                 self.assertIsInstance(file, codecs.StreamReaderWriter)
   1814 
   1815     def test_undefined(self):
   1816         self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
   1817         self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
   1818         self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
   1819         self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
   1820         for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
   1821             self.assertRaises(UnicodeError,
   1822                 codecs.encode, 'abc', 'undefined', errors)
   1823             self.assertRaises(UnicodeError,
   1824                 codecs.decode, b'abc', 'undefined', errors)
   1825 
   1826 
   1827 class StreamReaderTest(unittest.TestCase):
   1828 
   1829     def setUp(self):
   1830         self.reader = codecs.getreader('utf-8')
   1831         self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
   1832 
   1833     def test_readlines(self):
   1834         f = self.reader(self.stream)
   1835         self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
   1836 
   1837 
   1838 class EncodedFileTest(unittest.TestCase):
   1839 
   1840     def test_basic(self):
   1841         f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
   1842         ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
   1843         self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')
   1844 
   1845         f = io.BytesIO()
   1846         ef = codecs.EncodedFile(f, 'utf-8', 'latin-1')
   1847         ef.write(b'\xc3\xbc')
   1848         self.assertEqual(f.getvalue(), b'\xfc')
   1849 
# Every Unicode codec shipped with Python that the tests below smoke-test
# through encode/decode round trips (see BasicUnicodeTest).
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_t",
    "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]
   1954 
# Windows-only codecs: test them only when this interpreter provides them.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
if hasattr(codecs, "oem_encode"):
    all_unicode_encodings.append("oem")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_stateful = [
    "punycode",
    "unicode_internal"
]
   1969 
   1970 
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Smoke-test every codec in all_unicode_encodings through all APIs."""

    def test_basics(self):
        """Round-trip a short ASCII sample through every codec API."""
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The canonical name may differ from the registry key only in
            # "-" versus "_" spelling, modulo the two known quirks below.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "encoding=%r" % encoding)
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_stateful:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    # Feed one character at a time to exercise buffering.
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    # Likewise decode the result one byte at a time.
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_stateful:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "encoding=%r" % encoding)

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)
   2050 
    @support.cpython_only
    def test_basics_capi(self):
        """Same incremental round trips as test_basics, but with the
        encoder/decoder objects obtained through the C API helpers."""
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding not in broken_unicode_with_stateful:
                # check incremental decoder/encoder (fetched via the C API)
                try:
                    cencoder = codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = b""
                    for c in s:
                        # Encode one character at a time, then flush.
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)
   2089 
   2090     def test_seek(self):
   2091         # all codecs should be able to encode these
   2092         s = "%s\n%s\n" % (100*"abc123", 100*"def456")
   2093         for encoding in all_unicode_encodings:
   2094             if encoding == "idna": # FIXME: See SF bug #1163178
   2095                 continue
   2096             if encoding in broken_unicode_with_stateful:
   2097                 continue
   2098             reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
   2099             for t in range(5):
   2100                 # Test that calling seek resets the internal codec state and buffers
   2101                 reader.seek(0, 0)
   2102                 data = reader.read()
   2103                 self.assertEqual(s, data)
   2104 
   2105     def test_bad_decode_args(self):
   2106         for encoding in all_unicode_encodings:
   2107             decoder = codecs.getdecoder(encoding)
   2108             self.assertRaises(TypeError, decoder)
   2109             if encoding not in ("idna", "punycode"):
   2110                 self.assertRaises(TypeError, decoder, 42)
   2111 
   2112     def test_bad_encode_args(self):
   2113         for encoding in all_unicode_encodings:
   2114             encoder = codecs.getencoder(encoding)
   2115             with support.check_warnings():
   2116                 # unicode-internal has been deprecated
   2117                 self.assertRaises(TypeError, encoder)
   2118 
   2119     def test_encoding_map_type_initialized(self):
   2120         from encodings import cp1140
   2121         # This used to crash, we are only verifying there's no crash.
   2122         table_type = type(cp1140.encoding_table)
   2123         self.assertEqual(table_type, table_type)
   2124 
   2125     def test_decoder_state(self):
   2126         # Check that getstate() and setstate() handle the state properly
   2127         u = "abc123"
   2128         for encoding in all_unicode_encodings:
   2129             if encoding not in broken_unicode_with_stateful:
   2130                 self.check_state_handling_decode(encoding, u, u.encode(encoding))
   2131                 self.check_state_handling_encode(encoding, u, u.encode(encoding))
   2132 
   2133 
   2134 class CharmapTest(unittest.TestCase):
   2135     def test_decode_with_string_map(self):
   2136         self.assertEqual(
   2137             codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
   2138             ("abc", 3)
   2139         )
   2140 
   2141         self.assertEqual(
   2142             codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
   2143             ("\U0010FFFFbc", 3)
   2144         )
   2145 
   2146         self.assertRaises(UnicodeDecodeError,
   2147             codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
   2148         )
   2149 
   2150         self.assertRaises(UnicodeDecodeError,
   2151             codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe"
   2152         )
   2153 
   2154         self.assertEqual(
   2155             codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
   2156             ("ab\ufffd", 3)
   2157         )
   2158 
   2159         self.assertEqual(
   2160             codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
   2161             ("ab\ufffd", 3)
   2162         )
   2163 
   2164         self.assertEqual(
   2165             codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab"),
   2166             ("ab\\x02", 3)
   2167         )
   2168 
   2169         self.assertEqual(
   2170             codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab\ufffe"),
   2171             ("ab\\x02", 3)
   2172         )
   2173 
   2174         self.assertEqual(
   2175             codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
   2176             ("ab", 3)
   2177         )
   2178 
   2179         self.assertEqual(
   2180             codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
   2181             ("ab", 3)
   2182         )
   2183 
   2184         allbytes = bytes(range(256))
   2185         self.assertEqual(
   2186             codecs.charmap_decode(allbytes, "ignore", ""),
   2187             ("", len(allbytes))
   2188         )
   2189 
   2190     def test_decode_with_int2str_map(self):
   2191         self.assertEqual(
   2192             codecs.charmap_decode(b"\x00\x01\x02", "strict",
   2193                                   {0: 'a', 1: 'b', 2: 'c'}),
   2194             ("abc", 3)
   2195         )
   2196 
   2197         self.assertEqual(
   2198             codecs.charmap_decode(b"\x00\x01\x02", "strict",
   2199                                   {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
   2200             ("AaBbCc", 3)
   2201         )
   2202 
   2203         self.assertEqual(
   2204             codecs.charmap_decode(b"\x00\x01\x02", "strict",
   2205                                   {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
   2206             ("\U0010FFFFbc", 3)
   2207         )
   2208 
   2209         self.assertEqual(
   2210             codecs.charmap_decode(b"\x00\x01\x02", "strict",
   2211                                   {0: 'a', 1: 'b', 2: ''}),
   2212             ("ab", 3)
   2213         )
   2214 
   2215         self.assertRaises(UnicodeDecodeError,
   2216             codecs.charmap_decode, b"\x00\x01\x02", "strict",
   2217                                    {0: 'a', 1: 'b'}
   2218         )
   2219 
   2220         self.assertRaises(UnicodeDecodeError,
   2221             codecs.charmap_decode, b"\x00\x01\x02", "strict",
   2222                                    {0: 'a', 1: 'b', 2: None}
   2223         )
   2224 
   2225         # Issue #14850
   2226         self.assertRaises(UnicodeDecodeError,
   2227             codecs.charmap_decode, b"\x00\x01\x02", "strict",
   2228                                    {0: 'a', 1: 'b', 2: '\ufffe'}
   2229         )
   2230 
   2231         self.assertEqual(
   2232             codecs.charmap_decode(b"\x00\x01\x02", "replace",
   2233                                   {0: 'a', 1: 'b'}),
   2234             ("ab\ufffd", 3)
   2235         )
   2236 
   2237         self.assertEqual(
   2238             codecs.charmap_decode(b"\x00\x01\x02", "replace",
   2239                                   {0: 'a', 1: 'b', 2: None}),
   2240             ("ab\ufffd", 3)
   2241         )
   2242 
   2243         # Issue #14850
   2244         self.assertEqual(
   2245             codecs.charmap_decode(b"\x00\x01\x02", "replace",
   2246                                   {0: 'a', 1: 'b', 2: '\ufffe'}),
   2247             ("ab\ufffd", 3)
   2248         )
   2249 
   2250         self.assertEqual(
   2251             codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
   2252                                   {0: 'a', 1: 'b'}),
   2253             ("ab\\x02", 3)
   2254         )
   2255 
   2256         self.assertEqual(
   2257             codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
   2258                                   {0: 'a', 1: 'b', 2: None}),
   2259             ("ab\\x02", 3)
   2260         )
   2261 
   2262         # Issue #14850
   2263         self.assertEqual(
   2264             codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
   2265                                   {0: 'a', 1: 'b', 2: '\ufffe'}),
   2266             ("ab\\x02", 3)
   2267         )
   2268 
   2269         self.assertEqual(
   2270             codecs.charmap_decode(b"\x00\x01\x02", "ignore",
   2271                                   {0: 'a', 1: 'b'}),
   2272             ("ab", 3)
   2273         )
   2274 
   2275         self.assertEqual(
   2276             codecs.charmap_decode(b"\x00\x01\x02", "ignore",
   2277                                   {0: 'a', 1: 'b', 2: None}),
   2278             ("ab", 3)
   2279         )
   2280 
   2281         # Issue #14850
   2282         self.assertEqual(
   2283             codecs.charmap_decode(b"\x00\x01\x02", "ignore",
   2284                                   {0: 'a', 1: 'b', 2: '\ufffe'}),
   2285             ("ab", 3)
   2286         )
   2287 
   2288         allbytes = bytes(range(256))
   2289         self.assertEqual(
   2290             codecs.charmap_decode(allbytes, "ignore", {}),
   2291             ("", len(allbytes))
   2292         )
   2293 
   2294     def test_decode_with_int2int_map(self):
   2295         a = ord('a')
   2296         b = ord('b')
   2297         c = ord('c')
   2298 
   2299         self.assertEqual(
   2300             codecs.charmap_decode(b"\x00\x01\x02", "strict",
   2301                                   {0: a, 1: b, 2: c}),
   2302             ("abc", 3)
   2303         )
   2304 
   2305         # Issue #15379
   2306         self.assertEqual(
   2307             codecs.charmap_decode(b"\x00\x01\x02", "strict",
   2308                                   {0: 0x10FFFF, 1: b, 2: c}),
   2309             ("\U0010FFFFbc", 3)
   2310         )
   2311 
   2312         self.assertEqual(
   2313             codecs.charmap_decode(b"\x00\x01\x02", "strict",
   2314                                   {0: sys.maxunicode, 1: b, 2: c}),
   2315             (chr(sys.maxunicode) + "bc", 3)
   2316         )
   2317 
   2318         self.assertRaises(TypeError,
   2319             codecs.charmap_decode, b"\x00\x01\x02", "strict",
   2320                                    {0: sys.maxunicode + 1, 1: b, 2: c}
   2321         )
   2322 
   2323         self.assertRaises(UnicodeDecodeError,
   2324             codecs.charmap_decode, b"\x00\x01\x02", "strict",
   2325                                    {0: a, 1: b},
   2326         )
   2327 
   2328         self.assertRaises(UnicodeDecodeError,
   2329             codecs.charmap_decode, b"\x00\x01\x02", "strict",
   2330                                    {0: a, 1: b, 2: 0xFFFE},
   2331         )
   2332 
   2333         self.assertEqual(
   2334             codecs.charmap_decode(b"\x00\x01\x02", "replace",
   2335                                   {0: a, 1: b}),
   2336             ("ab\ufffd", 3)
   2337         )
   2338 
   2339         self.assertEqual(
   2340             codecs.charmap_decode(b"\x00\x01\x02", "replace",
   2341                                   {0: a, 1: b, 2: 0xFFFE}),
   2342             ("ab\ufffd", 3)
   2343         )
   2344 
   2345         self.assertEqual(
   2346             codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
   2347                                   {0: a, 1: b}),
   2348             ("ab\\x02", 3)
   2349         )
   2350 
   2351         self.assertEqual(
   2352             codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
   2353                                   {0: a, 1: b, 2: 0xFFFE}),
   2354             ("ab\\x02", 3)
   2355         )
   2356 
   2357         self.assertEqual(
   2358             codecs.charmap_decode(b"\x00\x01\x02", "ignore",
   2359                                   {0: a, 1: b}),
   2360             ("ab", 3)
   2361         )
   2362 
   2363         self.assertEqual(
   2364             codecs.charmap_decode(b"\x00\x01\x02", "ignore",
   2365                                   {0: a, 1: b, 2: 0xFFFE}),
   2366             ("ab", 3)
   2367         )
   2368 
   2369 
   2370 class WithStmtTest(unittest.TestCase):
   2371     def test_encodedfile(self):
   2372         f = io.BytesIO(b"\xc3\xbc")
   2373         with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
   2374             self.assertEqual(ef.read(), b"\xfc")
   2375         self.assertTrue(f.closed)
   2376 
   2377     def test_streamreaderwriter(self):
   2378         f = io.BytesIO(b"\xc3\xbc")
   2379         info = codecs.lookup("utf-8")
   2380         with codecs.StreamReaderWriter(f, info.streamreader,
   2381                                        info.streamwriter, 'strict') as srw:
   2382             self.assertEqual(srw.read(), "\xfc")
   2383 
   2384 
   2385 class TypesTest(unittest.TestCase):
   2386     def test_decode_unicode(self):
   2387         # Most decoders don't accept unicode input
   2388         decoders = [
   2389             codecs.utf_7_decode,
   2390             codecs.utf_8_decode,
   2391             codecs.utf_16_le_decode,
   2392             codecs.utf_16_be_decode,
   2393             codecs.utf_16_ex_decode,
   2394             codecs.utf_32_decode,
   2395             codecs.utf_32_le_decode,
   2396             codecs.utf_32_be_decode,
   2397             codecs.utf_32_ex_decode,
   2398             codecs.latin_1_decode,
   2399             codecs.ascii_decode,
   2400             codecs.charmap_decode,
   2401         ]
   2402         if hasattr(codecs, "mbcs_decode"):
   2403             decoders.append(codecs.mbcs_decode)
   2404         for decoder in decoders:
   2405             self.assertRaises(TypeError, decoder, "xxx")
   2406 
   2407     def test_unicode_escape(self):
   2408         # Escape-decoding a unicode string is supported and gives the same
   2409         # result as decoding the equivalent ASCII bytes string.
   2410         self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
   2411         self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
   2412         self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
   2413         self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
   2414 
   2415         self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
   2416         self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
   2417         self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "backslashreplace"),
   2418                          (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
   2419 
   2420         self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
   2421         self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
   2422         self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "backslashreplace"),
   2423                          (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
   2424 
   2425 
   2426 class UnicodeEscapeTest(unittest.TestCase):
   2427     def test_empty(self):
   2428         self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
   2429         self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
   2430 
   2431     def test_raw_encode(self):
   2432         encode = codecs.unicode_escape_encode
   2433         for b in range(32, 127):
   2434             if b != b'\\'[0]:
   2435                 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
   2436 
   2437     def test_raw_decode(self):
   2438         decode = codecs.unicode_escape_decode
   2439         for b in range(256):
   2440             if b != b'\\'[0]:
   2441                 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
   2442 
   2443     def test_escape_encode(self):
   2444         encode = codecs.unicode_escape_encode
   2445         check = coding_checker(self, encode)
   2446         check('\t', br'\t')
   2447         check('\n', br'\n')
   2448         check('\r', br'\r')
   2449         check('\\', br'\\')
   2450         for b in range(32):
   2451             if chr(b) not in '\t\n\r':
   2452                 check(chr(b), ('\\x%02x' % b).encode())
   2453         for b in range(127, 256):
   2454             check(chr(b), ('\\x%02x' % b).encode())
   2455         check('\u20ac', br'\u20ac')
   2456         check('\U0001d120', br'\U0001d120')
   2457 
   2458     def test_escape_decode(self):
   2459         decode = codecs.unicode_escape_decode
   2460         check = coding_checker(self, decode)
   2461         check(b"[\\\n]", "[]")
   2462         check(br'[\"]', '["]')
   2463         check(br"[\']", "[']")
   2464         check(br"[\\]", r"[\]")
   2465         check(br"[\a]", "[\x07]")
   2466         check(br"[\b]", "[\x08]")
   2467         check(br"[\t]", "[\x09]")
   2468         check(br"[\n]", "[\x0a]")
   2469         check(br"[\v]", "[\x0b]")
   2470         check(br"[\f]", "[\x0c]")
   2471         check(br"[\r]", "[\x0d]")
   2472         check(br"[\7]", "[\x07]")
   2473         check(br"[\78]", "[\x078]")
   2474         check(br"[\41]", "[!]")
   2475         check(br"[\418]", "[!8]")
   2476         check(br"[\101]", "[A]")
   2477         check(br"[\1010]", "[A0]")
   2478         check(br"[\x41]", "[A]")
   2479         check(br"[\x410]", "[A0]")
   2480         check(br"\u20ac", "\u20ac")
   2481         check(br"\U0001d120", "\U0001d120")
   2482         for i in range(97, 123):
   2483             b = bytes([i])
   2484             if b not in b'abfnrtuvx':
   2485                 with self.assertWarns(DeprecationWarning):
   2486                     check(b"\\" + b, "\\" + chr(i))
   2487             if b.upper() not in b'UN':
   2488                 with self.assertWarns(DeprecationWarning):
   2489                     check(b"\\" + b.upper(), "\\" + chr(i-32))
   2490         with self.assertWarns(DeprecationWarning):
   2491             check(br"\8", "\\8")
   2492         with self.assertWarns(DeprecationWarning):
   2493             check(br"\9", "\\9")
   2494         with self.assertWarns(DeprecationWarning):
   2495             check(b"\\\xfa", "\\\xfa")
   2496 
   2497     def test_decode_errors(self):
   2498         decode = codecs.unicode_escape_decode
   2499         for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
   2500             for i in range(d):
   2501                 self.assertRaises(UnicodeDecodeError, decode,
   2502                                   b"\\" + c + b"0"*i)
   2503                 self.assertRaises(UnicodeDecodeError, decode,
   2504                                   b"[\\" + c + b"0"*i + b"]")
   2505                 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
   2506                 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
   2507                 self.assertEqual(decode(data, "replace"),
   2508                                  ("[\ufffd]\ufffd", len(data)))
   2509         self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
   2510         self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
   2511         self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
   2512 
   2513 
   2514 class RawUnicodeEscapeTest(unittest.TestCase):
   2515     def test_empty(self):
   2516         self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
   2517         self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
   2518 
   2519     def test_raw_encode(self):
   2520         encode = codecs.raw_unicode_escape_encode
   2521         for b in range(256):
   2522             self.assertEqual(encode(chr(b)), (bytes([b]), 1))
   2523 
   2524     def test_raw_decode(self):
   2525         decode = codecs.raw_unicode_escape_decode
   2526         for b in range(256):
   2527             self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
   2528 
   2529     def test_escape_encode(self):
   2530         encode = codecs.raw_unicode_escape_encode
   2531         check = coding_checker(self, encode)
   2532         for b in range(256):
   2533             if b not in b'uU':
   2534                 check('\\' + chr(b), b'\\' + bytes([b]))
   2535         check('\u20ac', br'\u20ac')
   2536         check('\U0001d120', br'\U0001d120')
   2537 
   2538     def test_escape_decode(self):
   2539         decode = codecs.raw_unicode_escape_decode
   2540         check = coding_checker(self, decode)
   2541         for b in range(256):
   2542             if b not in b'uU':
   2543                 check(b'\\' + bytes([b]), '\\' + chr(b))
   2544         check(br"\u20ac", "\u20ac")
   2545         check(br"\U0001d120", "\U0001d120")
   2546 
   2547     def test_decode_errors(self):
   2548         decode = codecs.raw_unicode_escape_decode
   2549         for c, d in (b'u', 4), (b'U', 4):
   2550             for i in range(d):
   2551                 self.assertRaises(UnicodeDecodeError, decode,
   2552                                   b"\\" + c + b"0"*i)
   2553                 self.assertRaises(UnicodeDecodeError, decode,
   2554                                   b"[\\" + c + b"0"*i + b"]")
   2555                 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
   2556                 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
   2557                 self.assertEqual(decode(data, "replace"),
   2558                                  ("[\ufffd]\ufffd", len(data)))
   2559         self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
   2560         self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
   2561         self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
   2562 
   2563 
   2564 class EscapeEncodeTest(unittest.TestCase):
   2565 
   2566     def test_escape_encode(self):
   2567         tests = [
   2568             (b'', (b'', 0)),
   2569             (b'foobar', (b'foobar', 6)),
   2570             (b'spam\0eggs', (b'spam\\x00eggs', 9)),
   2571             (b'a\'b', (b"a\\'b", 3)),
   2572             (b'b\\c', (b'b\\\\c', 3)),
   2573             (b'c\nd', (b'c\\nd', 3)),
   2574             (b'd\re', (b'd\\re', 3)),
   2575             (b'f\x7fg', (b'f\\x7fg', 3)),
   2576         ]
   2577         for data, output in tests:
   2578             with self.subTest(data=data):
   2579                 self.assertEqual(codecs.escape_encode(data), output)
   2580         self.assertRaises(TypeError, codecs.escape_encode, 'spam')
   2581         self.assertRaises(TypeError, codecs.escape_encode, bytearray(b'spam'))
   2582 
   2583 
   2584 class SurrogateEscapeTest(unittest.TestCase):
   2585 
   2586     def test_utf8(self):
   2587         # Bad byte
   2588         self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
   2589                          "foo\udc80bar")
   2590         self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
   2591                          b"foo\x80bar")
   2592         # bad-utf-8 encoded surrogate
   2593         self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
   2594                          "\udced\udcb0\udc80")
   2595         self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
   2596                          b"\xed\xb0\x80")
   2597 
   2598     def test_ascii(self):
   2599         # bad byte
   2600         self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
   2601                          "foo\udc80bar")
   2602         self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
   2603                          b"foo\x80bar")
   2604 
   2605     def test_charmap(self):
   2606         # bad byte: \xa5 is unmapped in iso-8859-3
   2607         self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
   2608                          "foo\udca5bar")
   2609         self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
   2610                          b"foo\xa5bar")
   2611 
   2612     def test_latin1(self):
   2613         # Issue6373
   2614         self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"),
   2615                          b"\xe4\xeb\xef\xf6\xfc")
   2616 
   2617 
class BomTest(unittest.TestCase):
    # Checks when the BOM-capable codecs emit their byte-order mark:
    # exactly once per file, re-emitted after seeking back to offset 0,
    # and never after a seek to any other position.
    def test_seek0(self):
        """Write/seek/re-write through codecs.open() and the underlying
        StreamWriter, and verify the read-back content each time."""
        data = "1234567890"
        # The UTF-16/UTF-32 family, with and without explicit endianness.
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
   2673 
   2674 
# Binary->binary transform codecs that every build provides.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]

# Canonical codec name -> accepted alias spellings.
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

# zlib and bz2 are optional build-time features: register their codecs
# only when the modules are importable.  zlib is also kept as a module
# reference (or None) for skipUnless checks below.
try:
    import zlib
except ImportError:
    zlib = None
else:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
   2704 
   2705 
   2706 class TransformCodecTest(unittest.TestCase):
   2707 
   2708     def test_basics(self):
   2709         binput = bytes(range(256))
   2710         for encoding in bytes_transform_encodings:
   2711             with self.subTest(encoding=encoding):
   2712                 # generic codecs interface
   2713                 (o, size) = codecs.getencoder(encoding)(binput)
   2714                 self.assertEqual(size, len(binput))
   2715                 (i, size) = codecs.getdecoder(encoding)(o)
   2716                 self.assertEqual(size, len(o))
   2717                 self.assertEqual(i, binput)
   2718 
   2719     def test_read(self):
   2720         for encoding in bytes_transform_encodings:
   2721             with self.subTest(encoding=encoding):
   2722                 sin = codecs.encode(b"\x80", encoding)
   2723                 reader = codecs.getreader(encoding)(io.BytesIO(sin))
   2724                 sout = reader.read()
   2725                 self.assertEqual(sout, b"\x80")
   2726 
   2727     def test_readline(self):
   2728         for encoding in bytes_transform_encodings:
   2729             with self.subTest(encoding=encoding):
   2730                 sin = codecs.encode(b"\x80", encoding)
   2731                 reader = codecs.getreader(encoding)(io.BytesIO(sin))
   2732                 sout = reader.readline()
   2733                 self.assertEqual(sout, b"\x80")
   2734 
   2735     def test_buffer_api_usage(self):
   2736         # We check all the transform codecs accept memoryview input
   2737         # for encoding and decoding
   2738         # and also that they roundtrip correctly
   2739         original = b"12345\x80"
   2740         for encoding in bytes_transform_encodings:
   2741             with self.subTest(encoding=encoding):
   2742                 data = original
   2743                 view = memoryview(data)
   2744                 data = codecs.encode(data, encoding)
   2745                 view_encoded = codecs.encode(view, encoding)
   2746                 self.assertEqual(view_encoded, data)
   2747                 view = memoryview(data)
   2748                 data = codecs.decode(data, encoding)
   2749                 self.assertEqual(data, original)
   2750                 view_decoded = codecs.decode(view, encoding)
   2751                 self.assertEqual(view_decoded, data)
   2752 
   2753     def test_text_to_binary_blacklists_binary_transforms(self):
   2754         # Check binary -> binary codecs give a good error for str input
   2755         bad_input = "bad input type"
   2756         for encoding in bytes_transform_encodings:
   2757             with self.subTest(encoding=encoding):
   2758                 fmt = (r"{!r} is not a text encoding; "
   2759                        r"use codecs.encode\(\) to handle arbitrary codecs")
   2760                 msg = fmt.format(encoding)
   2761                 with self.assertRaisesRegex(LookupError, msg) as failure:
   2762                     bad_input.encode(encoding)
   2763                 self.assertIsNone(failure.exception.__cause__)
   2764 
   2765     def test_text_to_binary_blacklists_text_transforms(self):
   2766         # Check str.encode gives a good error message for str -> str codecs
   2767         msg = (r"^'rot_13' is not a text encoding; "
   2768                r"use codecs.encode\(\) to handle arbitrary codecs")
   2769         with self.assertRaisesRegex(LookupError, msg):
   2770             "just an example message".encode("rot_13")
   2771 
   2772     def test_binary_to_text_blacklists_binary_transforms(self):
   2773         # Check bytes.decode and bytearray.decode give a good error
   2774         # message for binary -> binary codecs
   2775         data = b"encode first to ensure we meet any format restrictions"
   2776         for encoding in bytes_transform_encodings:
   2777             with self.subTest(encoding=encoding):
   2778                 encoded_data = codecs.encode(data, encoding)
   2779                 fmt = (r"{!r} is not a text encoding; "
   2780                        r"use codecs.decode\(\) to handle arbitrary codecs")
   2781                 msg = fmt.format(encoding)
   2782                 with self.assertRaisesRegex(LookupError, msg):
   2783                     encoded_data.decode(encoding)
   2784                 with self.assertRaisesRegex(LookupError, msg):
   2785                     bytearray(encoded_data).decode(encoding)
   2786 
   2787     def test_binary_to_text_blacklists_text_transforms(self):
   2788         # Check str -> str codec gives a good error for binary input
   2789         for bad_input in (b"immutable", bytearray(b"mutable")):
   2790             with self.subTest(bad_input=bad_input):
   2791                 msg = (r"^'rot_13' is not a text encoding; "
   2792                        r"use codecs.decode\(\) to handle arbitrary codecs")
   2793                 with self.assertRaisesRegex(LookupError, msg) as failure:
   2794                     bad_input.decode("rot_13")
   2795                 self.assertIsNone(failure.exception.__cause__)
   2796 
   2797     @unittest.skipUnless(zlib, "Requires zlib support")
   2798     def test_custom_zlib_error_is_wrapped(self):
   2799         # Check zlib codec gives a good error for malformed input
   2800         msg = "^decoding with 'zlib_codec' codec failed"
   2801         with self.assertRaisesRegex(Exception, msg) as failure:
   2802             codecs.decode(b"hello", "zlib_codec")
   2803         self.assertIsInstance(failure.exception.__cause__,
   2804                                                 type(failure.exception))
   2805 
   2806     def test_custom_hex_error_is_wrapped(self):
   2807         # Check hex codec gives a good error for malformed input
   2808         msg = "^decoding with 'hex_codec' codec failed"
   2809         with self.assertRaisesRegex(Exception, msg) as failure:
   2810             codecs.decode(b"hello", "hex_codec")
   2811         self.assertIsInstance(failure.exception.__cause__,
   2812                                                 type(failure.exception))
   2813 
   2814     # Unfortunately, the bz2 module throws OSError, which the codec
   2815     # machinery currently can't wrap :(
   2816 
   2817     # Ensure codec aliases from http://bugs.python.org/issue7475 work
   2818     def test_aliases(self):
   2819         for codec_name, aliases in transform_aliases.items():
   2820             expected_name = codecs.lookup(codec_name).name
   2821             for alias in aliases:
   2822                 with self.subTest(alias=alias):
   2823                     info = codecs.lookup(alias)
   2824                     self.assertEqual(info.name, expected_name)
   2825 
   2826     def test_quopri_stateless(self):
   2827         # Should encode with quotetabs=True
   2828         encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
   2829         self.assertEqual(encoded, b"space=20tab=09eol=20\n")
   2830         # But should still support unescaped tabs and spaces
   2831         unescaped = b"space tab eol\n"
   2832         self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)
   2833 
   2834     def test_uu_invalid(self):
   2835         # Missing "begin" line
   2836         self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
   2837 
   2838 
# The codec system tries to wrap exceptions in order to ensure the error
# mentions the operation being performed and the codec involved. We
# currently *only* want this to happen for relatively stateless
# exceptions, where the only significant information they contain is their
# type and a single str argument.

# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
# (search functions can never be unregistered, so each test run reuses
# this one registry and merely mutates the dict behind it)
_TEST_CODECS = {}

def _get_test_codec(codec_name):
    # Codec search function backed by the local registry above; returns
    # None for unknown names so other registered search functions still
    # get a chance to resolve them.
    return _TEST_CODECS.get(codec_name)
codecs.register(_get_test_codec) # Returns None, not usable as a decorator

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    # Non-CPython fallback: cache clearing becomes a harmless no-op;
    # callers treat it as best-effort (see ExceptionChainingTest.tearDown)
    def _forget_codec(codec_name):
        pass
   2859 
   2860 
class ExceptionChainingTest(unittest.TestCase):
    """Verify how the codec machinery wraps exceptions raised by codecs.

    Errors raised while encoding/decoding should be re-raised as the same
    exception type with a message naming the codec and operation, chaining
    the original via __cause__ -- but only for "stateless" exceptions
    whose only payload is their type and a single str argument.
    """

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        unique_id = repr(self) + str(id(self))
        self.codec_name = encodings.normalize_encoding(unique_id).lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        # Remove this test's codec from the local registry and from the
        # interpreter-level caches (best-effort; see _forget_codec above)
        _TEST_CODECS.pop(self.codec_name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(self.codec_name, None)
        try:
            _forget_codec(self.codec_name)
        except KeyError:
            pass

    def set_codec(self, encode, decode):
        # Install the given encode/decode pair under this test's unique
        # codec name in the module-local registry used by _get_test_codec
        codec_info = codecs.CodecInfo(encode, decode,
                                      name=self.codec_name)
        _TEST_CODECS[self.codec_name] = codec_info

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Assert the yielded block raises a properly wrapped codec error.

        The raised exception must be of *exc_type*, mention *operation*
        and the codec name in its message, and chain the original
        exception via __cause__ with its traceback preserved.
        """
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
                  operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        self.assertIsInstance(caught.exception.__cause__, exc_type)
        self.assertIsNotNone(caught.exception.__cause__.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        # Install a codec that raises obj_to_raise and verify all four
        # entry points (str.encode, codecs.encode, bytes.decode,
        # codecs.decode) wrap the error as described by assertWrapped
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertWrapped("encoding", exc_type, msg):
            "str_input".encode(self.codec_name)
        with self.assertWrapped("encoding", exc_type, msg):
            codecs.encode("str_input", self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        msg = "This should be wrapped"
        self.check_wrapped(RuntimeError(msg), msg)

    def test_raise_grandchild_subclass_exact_size(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        # Install a codec that raises obj_to_raise and verify all four
        # entry points propagate the exception unchanged (no wrapping)
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        with self.assertRaisesRegex(RuntimeError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        msg = "This should NOT be wrapped"
        exc = RuntimeError(msg)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(msg))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        msg_re = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        msg = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        with self.assertRaisesRegex(LookupError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        encoded = codecs.encode(None, self.codec_name)
        self.assertEqual(encoded, "not bytes!")
        decoded = codecs.decode(None, self.codec_name)
        self.assertEqual(decoded, b"not str!")
        # Text model methods should complain
        fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            "str_input".encode(self.codec_name)
        fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
               r"use codecs.decode\(\) to decode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            b"bytes input".decode(self.codec_name)
   3020 
   3021 
   3022 
   3023 @unittest.skipUnless(sys.platform == 'win32',
   3024                      'code pages are specific to Windows')
   3025 class CodePageTest(unittest.TestCase):
   3026     # CP_UTF8 is already tested by CP65001Test
   3027     CP_UTF8 = 65001
   3028 
   3029     def test_invalid_code_page(self):
   3030         self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
   3031         self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
   3032         self.assertRaises(OSError, codecs.code_page_encode, 123, 'a')
   3033         self.assertRaises(OSError, codecs.code_page_decode, 123, b'a')
   3034 
   3035     def test_code_page_name(self):
   3036         self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
   3037             codecs.code_page_encode, 932, '\xff')
   3038         self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
   3039             codecs.code_page_decode, 932, b'\x81\x00', 'strict', True)
   3040         self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
   3041             codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)
   3042 
   3043     def check_decode(self, cp, tests):
   3044         for raw, errors, expected in tests:
   3045             if expected is not None:
   3046                 try:
   3047                     decoded = codecs.code_page_decode(cp, raw, errors, True)
   3048                 except UnicodeDecodeError as err:
   3049                     self.fail('Unable to decode %a from "cp%s" with '
   3050                               'errors=%r: %s' % (raw, cp, errors, err))
   3051                 self.assertEqual(decoded[0], expected,
   3052                     '%a.decode("cp%s", %r)=%a != %a'
   3053                     % (raw, cp, errors, decoded[0], expected))
   3054                 # assert 0 <= decoded[1] <= len(raw)
   3055                 self.assertGreaterEqual(decoded[1], 0)
   3056                 self.assertLessEqual(decoded[1], len(raw))
   3057             else:
   3058                 self.assertRaises(UnicodeDecodeError,
   3059                     codecs.code_page_decode, cp, raw, errors, True)
   3060 
   3061     def check_encode(self, cp, tests):
   3062         for text, errors, expected in tests:
   3063             if expected is not None:
   3064                 try:
   3065                     encoded = codecs.code_page_encode(cp, text, errors)
   3066                 except UnicodeEncodeError as err:
   3067                     self.fail('Unable to encode %a to "cp%s" with '
   3068                               'errors=%r: %s' % (text, cp, errors, err))
   3069                 self.assertEqual(encoded[0], expected,
   3070                     '%a.encode("cp%s", %r)=%a != %a'
   3071                     % (text, cp, errors, encoded[0], expected))
   3072                 self.assertEqual(encoded[1], len(text))
   3073             else:
   3074                 self.assertRaises(UnicodeEncodeError,
   3075                     codecs.code_page_encode, cp, text, errors)
   3076 
   3077     def test_cp932(self):
   3078         self.check_encode(932, (
   3079             ('abc', 'strict', b'abc'),
   3080             ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
   3081             # test error handlers
   3082             ('\xff', 'strict', None),
   3083             ('[\xff]', 'ignore', b'[]'),
   3084             ('[\xff]', 'replace', b'[y]'),
   3085             ('[\u20ac]', 'replace', b'[?]'),
   3086             ('[\xff]', 'backslashreplace', b'[\\xff]'),
   3087             ('[\xff]', 'namereplace',
   3088              b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
   3089             ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
   3090             ('\udcff', 'strict', None),
   3091             ('[\udcff]', 'surrogateescape', b'[\xff]'),
   3092             ('[\udcff]', 'surrogatepass', None),
   3093         ))
   3094         self.check_decode(932, (
   3095             (b'abc', 'strict', 'abc'),
   3096             (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
   3097             # invalid bytes
   3098             (b'[\xff]', 'strict', None),
   3099             (b'[\xff]', 'ignore', '[]'),
   3100             (b'[\xff]', 'replace', '[\ufffd]'),
   3101             (b'[\xff]', 'backslashreplace', '[\\xff]'),
   3102             (b'[\xff]', 'surrogateescape', '[\udcff]'),
   3103             (b'[\xff]', 'surrogatepass', None),
   3104             (b'\x81\x00abc', 'strict', None),
   3105             (b'\x81\x00abc', 'ignore', '\x00abc'),
   3106             (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
   3107             (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
   3108         ))
   3109 
   3110     def test_cp1252(self):
   3111         self.check_encode(1252, (
   3112             ('abc', 'strict', b'abc'),
   3113             ('\xe9\u20ac', 'strict',  b'\xe9\x80'),
   3114             ('\xff', 'strict', b'\xff'),
   3115             # test error handlers
   3116             ('\u0141', 'strict', None),
   3117             ('\u0141', 'ignore', b''),
   3118             ('\u0141', 'replace', b'L'),
   3119             ('\udc98', 'surrogateescape', b'\x98'),
   3120             ('\udc98', 'surrogatepass', None),
   3121         ))
   3122         self.check_decode(1252, (
   3123             (b'abc', 'strict', 'abc'),
   3124             (b'\xe9\x80', 'strict', '\xe9\u20ac'),
   3125             (b'\xff', 'strict', '\xff'),
   3126         ))
   3127 
   3128     def test_cp_utf7(self):
   3129         cp = 65000
   3130         self.check_encode(cp, (
   3131             ('abc', 'strict', b'abc'),
   3132             ('\xe9\u20ac', 'strict',  b'+AOkgrA-'),
   3133             ('\U0010ffff', 'strict',  b'+2//f/w-'),
   3134             ('\udc80', 'strict', b'+3IA-'),
   3135             ('\ufffd', 'strict', b'+//0-'),
   3136         ))
   3137         self.check_decode(cp, (
   3138             (b'abc', 'strict', 'abc'),
   3139             (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
   3140             (b'+2//f/w-', 'strict', '\U0010ffff'),
   3141             (b'+3IA-', 'strict', '\udc80'),
   3142             (b'+//0-', 'strict', '\ufffd'),
   3143             # invalid bytes
   3144             (b'[+/]', 'strict', '[]'),
   3145             (b'[\xff]', 'strict', '[\xff]'),
   3146         ))
   3147 
   3148     def test_multibyte_encoding(self):
   3149         self.check_decode(932, (
   3150             (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
   3151             (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
   3152         ))
   3153         self.check_decode(self.CP_UTF8, (
   3154             (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
   3155             (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
   3156         ))
   3157         self.check_encode(self.CP_UTF8, (
   3158             ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
   3159             ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
   3160         ))
   3161 
   3162     def test_incremental(self):
   3163         decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
   3164         self.assertEqual(decoded, ('', 0))
   3165 
   3166         decoded = codecs.code_page_decode(932,
   3167                                           b'\xe9\x80\xe9', 'strict',
   3168                                           False)
   3169         self.assertEqual(decoded, ('\u9a3e', 2))
   3170 
   3171         decoded = codecs.code_page_decode(932,
   3172                                           b'\xe9\x80\xe9\x80', 'strict',
   3173                                           False)
   3174         self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))
   3175 
   3176         decoded = codecs.code_page_decode(932,
   3177                                           b'abc', 'strict',
   3178                                           False)
   3179         self.assertEqual(decoded, ('abc', 3))
   3180 
   3181     def test_mbcs_alias(self):
   3182         # Check that looking up our 'default' codepage will return
   3183         # mbcs when we don't have a more specific one available
   3184         with mock.patch('_winapi.GetACP', return_value=123):
   3185             codec = codecs.lookup('cp123')
   3186             self.assertEqual(codec.name, 'mbcs')
   3187 
   3188     @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
   3189     def test_large_input(self):
   3190         # Test input longer than INT_MAX.
   3191         # Input should contain undecodable bytes before and after
   3192         # the INT_MAX limit.
   3193         encoded = (b'01234567' * (2**28-1) +
   3194                    b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
   3195         self.assertEqual(len(encoded), 2**31+2)
   3196         decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
   3197         self.assertEqual(decoded[1], len(encoded))
   3198         del encoded
   3199         self.assertEqual(len(decoded[0]), decoded[1])
   3200         self.assertEqual(decoded[0][:10], '0123456701')
   3201         self.assertEqual(decoded[0][-20:],
   3202                          '6701234567'
   3203                          '\udc85\udc86\udcea\udceb\udcec'
   3204                          '\udcef\udcfc\udcfd\udcfe\udcff')
   3205 
   3206 
   3207 class ASCIITest(unittest.TestCase):
   3208     def test_encode(self):
   3209         self.assertEqual('abc123'.encode('ascii'), b'abc123')
   3210 
   3211     def test_encode_error(self):
   3212         for data, error_handler, expected in (
   3213             ('[\x80\xff\u20ac]', 'ignore', b'[]'),
   3214             ('[\x80\xff\u20ac]', 'replace', b'[???]'),
   3215             ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
   3216             ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
   3217              b'[\\x80\\xff\\u20ac\\U000abcde]'),
   3218             ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
   3219         ):
   3220             with self.subTest(data=data, error_handler=error_handler,
   3221                               expected=expected):
   3222                 self.assertEqual(data.encode('ascii', error_handler),
   3223                                  expected)
   3224 
   3225     def test_encode_surrogateescape_error(self):
   3226         with self.assertRaises(UnicodeEncodeError):
   3227             # the first character can be decoded, but not the second
   3228             '\udc80\xff'.encode('ascii', 'surrogateescape')
   3229 
   3230     def test_decode(self):
   3231         self.assertEqual(b'abc'.decode('ascii'), 'abc')
   3232 
   3233     def test_decode_error(self):
   3234         for data, error_handler, expected in (
   3235             (b'[\x80\xff]', 'ignore', '[]'),
   3236             (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
   3237             (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
   3238             (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
   3239         ):
   3240             with self.subTest(data=data, error_handler=error_handler,
   3241                               expected=expected):
   3242                 self.assertEqual(data.decode('ascii', error_handler),
   3243                                  expected)
   3244 
   3245 
   3246 class Latin1Test(unittest.TestCase):
   3247     def test_encode(self):
   3248         for data, expected in (
   3249             ('abc', b'abc'),
   3250             ('\x80\xe9\xff', b'\x80\xe9\xff'),
   3251         ):
   3252             with self.subTest(data=data, expected=expected):
   3253                 self.assertEqual(data.encode('latin1'), expected)
   3254 
   3255     def test_encode_errors(self):
   3256         for data, error_handler, expected in (
   3257             ('[\u20ac\udc80]', 'ignore', b'[]'),
   3258             ('[\u20ac\udc80]', 'replace', b'[??]'),
   3259             ('[\u20ac\U000abcde]', 'backslashreplace',
   3260              b'[\\u20ac\\U000abcde]'),
   3261             ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
   3262             ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
   3263         ):
   3264             with self.subTest(data=data, error_handler=error_handler,
   3265                               expected=expected):
   3266                 self.assertEqual(data.encode('latin1', error_handler),
   3267                                  expected)
   3268 
   3269     def test_encode_surrogateescape_error(self):
   3270         with self.assertRaises(UnicodeEncodeError):
   3271             # the first character can be decoded, but not the second
   3272             '\udc80\u20ac'.encode('latin1', 'surrogateescape')
   3273 
   3274     def test_decode(self):
   3275         for data, expected in (
   3276             (b'abc', 'abc'),
   3277             (b'[\x80\xff]', '[\x80\xff]'),
   3278         ):
   3279             with self.subTest(data=data, expected=expected):
   3280                 self.assertEqual(data.decode('latin1'), expected)
   3281 
   3282 
   3283 if __name__ == "__main__":
   3284     unittest.main()
   3285