# Extracted from CPython 2.x Lib/test/test_codecs.py (partial listing; web-viewer header removed)
      1 from test import test_support
      2 import unittest
      3 import codecs
      4 import locale
      5 import sys, StringIO
      6 
      7 def coding_checker(self, coder):
      8     def check(input, expect):
      9         self.assertEqual(coder(input), (expect, len(input)))
     10     return check
     11 
     12 class Queue(object):
     13     """
     14     queue: write bytes at one end, read bytes from the other end
     15     """
     16     def __init__(self):
     17         self._buffer = ""
     18 
     19     def write(self, chars):
     20         self._buffer += chars
     21 
     22     def read(self, size=-1):
     23         if size<0:
     24             s = self._buffer
     25             self._buffer = ""
     26             return s
     27         else:
     28             s = self._buffer[:size]
     29             self._buffer = self._buffer[size:]
     30             return s
     31 
class ReadTest(unittest.TestCase):
    # Mixin-style base class: concrete subclasses must set the class
    # attribute ``encoding`` to the name of the codec under test.  Every
    # test below reads ``self.encoding`` and exercises the codec's
    # StreamReader / incremental decoder machinery.

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode(): decoding the whole byte string at once through
        # the iterator interface must reproduce the original unicode input
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        # Exercise StreamReader.readline() with every line-end convention
        # (\n, \r\n, lone \r, and U+2028 LINE SEPARATOR), with and without
        # keepends, and with small read sizes.
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", u"\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        # Interleaving readline()/readlines() with read() must not lose or
        # duplicate data buffered by the reader (regression tests for the
        # issues cited inline below).
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = StringIO.StringIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating a StreamReader line by line over realistic \r\n-terminated
        # input must yield exactly the original lines (bug #1175396).
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Feed a reader through a Queue so data arrives piecemeal, and check
        # readline() behaves correctly when a \r arrives before its possible
        # \n continuation.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        # readline() must not mangle lines longer than the reader's internal
        # read chunk (bug #1098990, case a).
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        # Companion case to test_bug1098990_a with several short lines
        # surrounding the boundary-straddling one (bug #1098990, case b).
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
    294         self.assertEqual(reader.readline(), u"")
    295 
class UTF32Test(ReadTest):
    # UTF-32 with BOM autodetection: the first four bytes select the byte
    # order for the rest of the stream.
    encoding = "utf-32"

    # u"spamspam" encoded little-endian / big-endian, each with one BOM.
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        # A StreamWriter must emit a single BOM for the whole stream, not
        # one per write() call.
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # Streams starting with bytes that are not a valid BOM must raise.
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected-output entry per input byte: 4 BOM bytes, then 4
        # bytes per code point; output only grows on each 4-byte boundary.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated sequence at end-of-input with final=True is handled by
        # the 'replace' and 'ignore' error handlers.
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        # Strict decoding of a lone byte must raise UnicodeDecodeError.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
    380 
class UTF32LETest(ReadTest):
    # Explicit little-endian UTF-32: no BOM, 4 bytes per code point.
    encoding = "utf-32-le"

    def test_partial(self):
        # One entry per input byte; output grows once every 4 bytes.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        # Little-endian byte order: least significant byte first.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        # Strict decoding of a lone byte must raise UnicodeDecodeError.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
    424 
class UTF32BETest(ReadTest):
    # Explicit big-endian UTF-32: no BOM, 4 bytes per code point.
    encoding = "utf-32-be"

    def test_partial(self):
        # One entry per input byte; output grows once every 4 bytes.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        # Big-endian byte order: most significant byte first.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        # Strict decoding of a lone byte must raise UnicodeDecodeError.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
    468 
    469 
class UTF16Test(ReadTest):
    # UTF-16 with BOM autodetection: the first two bytes select the byte
    # order for the rest of the stream.
    encoding = "utf-16"

    # u"spamspam" encoded little-endian / big-endian, each with one BOM.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # A StreamWriter must emit a single BOM for the whole stream, not
        # one per write() call.
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # Streams starting with bytes that are not a valid BOM must raise.
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One entry per input byte: 2 BOM bytes, then 2 bytes per BMP code
        # point and 4 for the surrogate-pair-encoded U+10000.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated sequence at end-of-input with final=True is handled by
        # the 'replace' and 'ignore' error handlers.
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        # Strict decoding of a lone byte must raise UnicodeDecodeError.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
    542 
class UTF16LETest(ReadTest):
    # Explicit little-endian UTF-16: no BOM.
    encoding = "utf-16-le"

    def test_partial(self):
        # One entry per input byte; output grows every 2 bytes, except the
        # final astral character which needs all 4 surrogate-pair bytes.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each raw input must raise under 'strict' and decode to the given
        # replacement-carrying string under 'replace'.
        tests = [
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        ]
        for raw, expected in tests:
            try:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_16_le_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
            except:
                # Report which input failed before re-raising.
                print 'raw=%r' % raw
                raise
    583 
class UTF16BETest(ReadTest):
    # Explicit big-endian UTF-16: no BOM.
    encoding = "utf-16-be"

    def test_partial(self):
        # One entry per input byte; output grows every 2 bytes, except the
        # final astral character which needs all 4 surrogate-pair bytes.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each raw input must raise under 'strict' and decode to the given
        # replacement-carrying string under 'replace'.
        tests = [
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        ]
        for raw, expected in tests:
            try:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_16_be_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
            except:
                # Report which input failed before re-raising.
                print 'raw=%r' % raw
                raise
    624 
class UTF8Test(ReadTest):
    # Variable-width UTF-8: 1 to 4 bytes per code point.
    encoding = "utf-8"

    def test_partial(self):
        # One entry per input byte; output only grows once the last byte of
        # each 1/2/3/4-byte sequence arrives.
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )
    649 
    650 class UTF7Test(ReadTest):
    651     encoding = "utf-7"
    652 
    def test_ascii(self):
        # RFC 2152 character classes: set D passes through directly, set O is
        # also emitted unencoded by Python's encoder, '+' escapes as '+-',
        # and everything else is base64-encoded in a '+...-' run.
        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        self.assertEqual(set_d.encode(self.encoding), set_d)
        self.assertEqual(set_d.decode(self.encoding), set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        self.assertEqual(set_o.encode(self.encoding), set_o)
        self.assertEqual(set_o.decode(self.encoding), set_o)
        # +
        self.assertEqual(u'a+b'.encode(self.encoding), 'a+-b')
        self.assertEqual('a+-b'.decode(self.encoding), u'a+b')
        # White spaces
        ws = ' \t\n\r'
        self.assertEqual(ws.encode(self.encoding), ws)
        self.assertEqual(ws.decode(self.encoding), ws)
        # Other ASCII characters
        other_ascii = ''.join(sorted(set(chr(i) for i in range(0x80)) -
                                     set(set_d + set_o + '+' + ws)))
        self.assertEqual(other_ascii.encode(self.encoding),
                         '+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')
    678 
    def test_partial(self):
        # One entry per input byte of 'a+-b'; the '+' yields no output until
        # the following character resolves it ('+-' decodes to a literal '+').
        self.check_partial(
            u"a+-b",
            [
                u"a",
                u"a",
                u"a+",
                u"a+-",
                u"a+-b",
            ]
        )
    690 
    def test_errors(self):
        # Malformed UTF-7: stray high bytes, truncated or non-terminated
        # base64 runs, and padding-bit errors.  Each raw input must raise
        # under 'strict' and decode to the given string under 'replace'.
        tests = [
            ('\xe1b', u'\ufffdb'),
            ('a\xe1b', u'a\ufffdb'),
            ('a\xe1\xe1b', u'a\ufffd\ufffdb'),
            ('a+IK', u'a\ufffd'),
            ('a+IK-b', u'a\ufffdb'),
            ('a+IK,b', u'a\ufffdb'),
            ('a+IKx', u'a\u20ac\ufffd'),
            ('a+IKx-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr', u'a\u20ac\ufffd'),
            ('a+IKwgr-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr,', u'a\u20ac\ufffd'),
            ('a+IKwgr,-b', u'a\u20ac\ufffd-b'),
            ('a+IKwgrB', u'a\u20ac\u20ac\ufffd'),
            ('a+IKwgrB-b', u'a\u20ac\u20ac\ufffdb'),
            ('a+/,+IKw-b', u'a\ufffd\u20acb'),
            ('a+//,+IKw-b', u'a\ufffd\u20acb'),
            ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
            ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
            ('a+IKw-b\xe1', u'a\u20acb\ufffd'),
            ('a+IKw\xe1b', u'a\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            try:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
            except:
                # Report which input failed before re-raising.
                print 'raw=%r' % raw
                raise
    722 
    def test_nonbmp(self):
        # Non-BMP characters travel through UTF-7 as UTF-16 surrogate
        # pairs inside the base64 section.  The trailing '-' terminator
        # is optional when decoding at end of input.
        roundtrips = [
            (u'\U000104A0', '+2AHcoA-'),
            (u'\u20ac\U000104A0', '+IKzYAdyg-'),
            (u'\u20ac\u20ac\U000104A0', '+IKwgrNgB3KA-'),
        ]
        for text, encoded in roundtrips:
            self.assertEqual(text.encode(self.encoding), encoded)
            self.assertEqual(encoded.decode(self.encoding), text)
            # Same decode with the optional terminator dropped.
            self.assertEqual(encoded[:-1].decode(self.encoding), text)
        # Writing the surrogate pair explicitly encodes identically to
        # the astral character itself.
        self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-')
    737 
    def test_lone_surrogates(self):
        """Lone surrogates in UTF-7 with errors='replace'.

        Per the expected values below: a properly terminated section
        ending in a lone surrogate decodes to that surrogate, while a
        section corrupted around the surrogate decodes to U+FFFD.
        """
        tests = [
            ('a+2AE-b', u'a\ud801b'),
            ('a+2AE\xe1b', u'a\ufffdb'),
            ('a+2AE', u'a\ufffd'),
            ('a+2AEA-b', u'a\ufffdb'),
            ('a+2AH-b', u'a\ufffdb'),
            ('a+IKzYAQ-b', u'a\u20ac\ud801b'),
            ('a+IKzYAQ\xe1b', u'a\u20ac\ufffdb'),
            ('a+IKzYAQA-b', u'a\u20ac\ufffdb'),
            ('a+IKzYAd-b', u'a\u20ac\ufffdb'),
            ('a+IKwgrNgB-b', u'a\u20ac\u20ac\ud801b'),
            ('a+IKwgrNgB\xe1b', u'a\u20ac\u20ac\ufffdb'),
            ('a+IKwgrNgB', u'a\u20ac\u20ac\ufffd'),
            ('a+IKwgrNgBA-b', u'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            try:
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
            except:
                # Identify the failing input before re-raising.
                print 'raw=%r' % raw
                raise
    760 
    761 class UTF16ExTest(unittest.TestCase):
    762 
    763     def test_errors(self):
    764         self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
    765 
    766     def test_bad_args(self):
    767         self.assertRaises(TypeError, codecs.utf_16_ex_decode)
    768 
    769 class ReadBufferTest(unittest.TestCase):
    770 
    771     def test_array(self):
    772         import array
    773         self.assertEqual(
    774             codecs.readbuffer_encode(array.array("c", "spam")),
    775             ("spam", 4)
    776         )
    777 
    778     def test_empty(self):
    779         self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
    780 
    781     def test_bad_args(self):
    782         self.assertRaises(TypeError, codecs.readbuffer_encode)
    783         self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
    784 
    785 class CharBufferTest(unittest.TestCase):
    786 
    787     def test_string(self):
    788         self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
    789 
    790     def test_empty(self):
    791         self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
    792 
    793     def test_bad_args(self):
    794         self.assertRaises(TypeError, codecs.charbuffer_encode)
    795         self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
    796 
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec, which skips one leading UTF-8
    BOM when decoding and emits one when encoding."""

    # Consumed by the ReadTest helpers (check_partial etc.).
    encoding = "utf-8-sig"

    def test_partial(self):
        # Feed the encoded bytes one at a time; only the FIRST BOM is a
        # signature and is skipped, the second is decoded as U+FEFF.
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        # (plain byte-string argument to unicode()).
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        # The incremental decoder must strip the signature BOM so that
        # encode/decode round-trips cleanly.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        # Stream reading must strip the leading BOM exactly once,
        # whatever chunk size is used for reading.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode unchanged, like plain UTF-8.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)
    880 
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode(), the string-escape decoder."""

    def test_empty(self):
        self.assertEqual(codecs.escape_decode(""), ("", 0))

    def test_raw(self):
        # Bytes other than backslash pass through untouched.
        decode = codecs.escape_decode
        for b in range(256):
            b = chr(b)
            if b != '\\':
                self.assertEqual(decode(b + '0'), (b + '0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # Line continuation, quotes and backslash.
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", br"[\]")
        # Single-character escapes.
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        # Octal escapes take up to three digits; \8 is not octal, and
        # \501 shows values wrap modulo 256.
        check(br"[\7]", b"[\x07]")
        check(br"[\8]", br"[\8]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        # Hex escapes take exactly two digits; \X is not recognized.
        check(br"[\x41]", b"[A]")
        check(br"[\X41]", br"[\X41]")
        check(br"[\x410]", b"[A0]")
        # Unrecognized escapes are left as-is, backslash included.
        for b in range(256):
            b = chr(b)
            if b not in '\n"\'\\abtnvfr01234567x':
                check('\\' + b, '\\' + b)

    def test_errors(self):
        # Truncated \x escapes: 'strict' raises ValueError, 'ignore'
        # drops them, 'replace' substitutes '?'.
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
    932 
    933 class RecodingTest(unittest.TestCase):
    934     def test_recoding(self):
    935         f = StringIO.StringIO()
    936         f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
    937         f2.write(u"a")
    938         f2.close()
    939         # Python used to crash on this at exit because of a refcount
    940         # bug in _codecsmodule.c
    941 
# Punycode sample strings from RFC 3492, section 7.1.  Each entry pairs
# a unicode string with its expected punycode encoding.
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
    u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
    u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
    u"\u0939\u0948\u0902",
    "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
    u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

     # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]
   1045 
# Sanity check: print any vector that is not a (unicode, punycode) pair.
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)
   1049 
   1050 class PunycodeTest(unittest.TestCase):
   1051     def test_encode(self):
   1052         for uni, puny in punycode_testcases:
   1053             # Need to convert both strings to lower case, since
   1054             # some of the extended encodings use upper case, but our
   1055             # code produces only lower case. Converting just puny to
   1056             # lower is also insufficient, since some of the input characters
   1057             # are upper case.
   1058             self.assertEqual(uni.encode("punycode").lower(), puny.lower())
   1059 
   1060     def test_decode(self):
   1061         for uni, puny in punycode_testcases:
   1062             self.assertEqual(uni, puny.decode("punycode"))
   1063 
   1064 class UnicodeInternalTest(unittest.TestCase):
   1065     def test_bug1251300(self):
   1066         # Decoding with unicode_internal used to not correctly handle "code
   1067         # points" above 0x10ffff on UCS-4 builds.
   1068         if sys.maxunicode > 0xffff:
   1069             ok = [
   1070                 ("\x00\x10\xff\xff", u"\U0010ffff"),
   1071                 ("\x00\x00\x01\x01", u"\U00000101"),
   1072                 ("", u""),
   1073             ]
   1074             not_ok = [
   1075                 "\x7f\xff\xff\xff",
   1076                 "\x80\x00\x00\x00",
   1077                 "\x81\x00\x00\x00",
   1078                 "\x00",
   1079                 "\x00\x00\x00\x00\x00",
   1080             ]
   1081             for internal, uni in ok:
   1082                 if sys.byteorder == "little":
   1083                     internal = "".join(reversed(internal))
   1084                 self.assertEqual(uni, internal.decode("unicode_internal"))
   1085             for internal in not_ok:
   1086                 if sys.byteorder == "little":
   1087                     internal = "".join(reversed(internal))
   1088                 self.assertRaises(UnicodeDecodeError, internal.decode,
   1089                     "unicode_internal")
   1090 
   1091     def test_decode_error_attributes(self):
   1092         if sys.maxunicode > 0xffff:
   1093             try:
   1094                 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
   1095             except UnicodeDecodeError, ex:
   1096                 self.assertEqual("unicode_internal", ex.encoding)
   1097                 self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
   1098                 self.assertEqual(4, ex.start)
   1099                 self.assertEqual(8, ex.end)
   1100             else:
   1101                 self.fail()
   1102 
   1103     def test_decode_callback(self):
   1104         if sys.maxunicode > 0xffff:
   1105             codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
   1106             decoder = codecs.getdecoder("unicode_internal")
   1107             ab = u"ab".encode("unicode_internal")
   1108             ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
   1109                 "UnicodeInternalTest")
   1110             self.assertEqual((u"ab", 12), ignored)
   1111 
   1112     def test_encode_length(self):
   1113         # Issue 3739
   1114         encoder = codecs.getencoder("unicode_internal")
   1115         self.assertEqual(encoder(u"a")[1], 1)
   1116         self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
   1117 
   1118         encoder = codecs.getencoder("string-escape")
   1119         self.assertEqual(encoder(r'\x00')[1], 4)
   1120 
# Nameprep (RFC 3491) test vectors, taken from the libidn draft:
# http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (UTF-8 input, expected UTF-8 output); the output is None
# when the input must be rejected, and (None, None) marks a skipped test.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
   1273 
   1274 
   1275 class NameprepTest(unittest.TestCase):
   1276     def test_nameprep(self):
   1277         from encodings.idna import nameprep
   1278         for pos, (orig, prepped) in enumerate(nameprep_tests):
   1279             if orig is None:
   1280                 # Skipped
   1281                 continue
   1282             # The Unicode strings are given in UTF-8
   1283             orig = unicode(orig, "utf-8")
   1284             if prepped is None:
   1285                 # Input contains prohibited characters
   1286                 self.assertRaises(UnicodeError, nameprep, orig)
   1287             else:
   1288                 prepped = unicode(prepped, "utf-8")
   1289                 try:
   1290                     self.assertEqual(nameprep(orig), prepped)
   1291                 except Exception,e:
   1292                     raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
   1293 
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: whole-string, stream and incremental
    encoding/decoding of internationalized domain names."""

    def test_builtin_decode(self):
        # ACE labels ("xn--...") decode to their Unicode form; a
        # trailing dot is preserved.
        self.assertEqual(unicode("python.org", "idna"), u"python.org")
        self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
        self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        # Pure-ASCII names pass through; non-ASCII labels become ACE.
        self.assertEqual(u"python.org".encode("idna"), "python.org")
        self.assertEqual("python.org.".encode("idna"), "python.org.")
        self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        # Reading past the end of the stream returns an empty string
        # instead of raising.
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEqual(r.read(), u"")

    def test_incremental_decode(self):
        # Whole-input iterdecode behaves like a plain decode.
        self.assertEqual(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        # The incremental decoder emits a label only once the dot that
        # ends it (or final=True) has been seen.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode(u"rg"), u"")
        self.assertEqual(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode("rg."), u"org.")
        self.assertEqual(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        # Whole-input iterencode behaves like a plain encode.
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        # The incremental encoder likewise buffers until a label is
        # complete.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEqual(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEqual(encoder.encode(u"", True), "")
   1370 
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level helper functions of `codecs`."""

    def test_decode(self):
        self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # With no encoding argument the default encoding is used.
        self.assertEqual(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode(u'abc'), 'abc')
        # NOTE(review): u'\xffff' is u'\xff' followed by 'ff', probably
        # intended as U+FFFF; it is unencodable in ASCII either way, so
        # the assertion holds regardless.
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as a dotless "i"
        oldlocale = locale.getlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')

    def test_all(self):
        # The public API plus a few undocumented BOM aliases must match
        # codecs.__all__ exactly, and every listed name must resolve.
        api = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertEqual(sorted(api), sorted(codecs.__all__))
        for api in codecs.__all__:
            getattr(codecs, api)
   1447 
   1448 class StreamReaderTest(unittest.TestCase):
   1449 
   1450     def setUp(self):
   1451         self.reader = codecs.getreader('utf-8')
   1452         self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
   1453 
   1454     def test_readlines(self):
   1455         f = self.reader(self.stream)
   1456         self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])
   1457 
   1458 class EncodedFileTest(unittest.TestCase):
   1459 
   1460     def test_basic(self):
   1461         f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
   1462         ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
   1463         self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')
   1464 
   1465         f = StringIO.StringIO()
   1466         ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
   1467         ef.write('\xc3\xbc')
   1468         self.assertEqual(f.getvalue(), '\xfc')
   1469 
   1470 class Str2StrTest(unittest.TestCase):
   1471 
   1472     def test_read(self):
   1473         sin = codecs.encode("\x80", "base64_codec")
   1474         reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
   1475         sout = reader.read()
   1476         self.assertEqual(sout, "\x80")
   1477         self.assertIsInstance(sout, str)
   1478 
   1479     def test_readline(self):
   1480         sin = codecs.encode("\x80", "base64_codec")
   1481         reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
   1482         sout = reader.readline()
   1483         self.assertEqual(sout, "\x80")
   1484         self.assertIsInstance(sout, str)
   1485 
# Codecs expected to round-trip the unicode test strings in
# BasicUnicodeTest below; entries use the normalized (underscore) names.
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# The Windows-only mbcs codec is tested only on builds that provide it.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# The copy is taken *before* the conditional additions below, so the
# incremental-coder checks still cover the codecs added afterwards.
broken_incremental_coders = broken_unicode_with_streams[:]

if sys.flags.py3k_warning:
    broken_unicode_with_streams.append("rot_13")

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# bz2/zlib support is optional; test their codecs only when the
# underlying compression modules are importable.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
   1639 
class BasicUnicodeTest(unittest.TestCase):
    """Round-trip and API checks run across every unicode-capable codec."""

    def test_basics(self):
        """Round-trip a simple string through every API flavor of each
        codec: stateless encoder/decoder, stream reader/writer,
        incremental coders, and iterencode()/iterdecode()."""
        s = u"abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            # lookup() returns a normalized name; re-append the "_codec"
            # suffix so the comparison below matches the list entry.
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                # latin_1's canonical name differs from the list entry;
                # pin it so the comparison holds.
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            # Stateless round trip: encode then decode must restore s.
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "encoding=%r" % encoding)
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer, feeding one character /
                # one byte at a time to exercise internal buffering
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final=True flushes any pending encoder state
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(
                            codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "encoding=%r" % encoding)

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(
                            codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c)
                                                 for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    @test_support.cpython_only
    def test_basics_capi(self):
        """Same incremental round trips, driven through the C API
        wrappers exposed by _testcapi."""
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = u"abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    cencoder = codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c)
                                                 for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    def test_seek(self):
        """seek(0) on a StreamReader must reset codec state and buffers."""
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        """Decoders reject missing and non-string arguments."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna/punycode accept more argument types, so skip them here
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Encoders reject a missing argument."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        """Importing a charmap codec must not crash on its encoding table."""
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
   1786 
   1787 class BasicStrTest(unittest.TestCase):
   1788     def test_basics(self):
   1789         s = "abc123"
   1790         for encoding in all_string_encodings:
   1791             (bytes, size) = codecs.getencoder(encoding)(s)
   1792             self.assertEqual(size, len(s))
   1793             (chars, size) = codecs.getdecoder(encoding)(bytes)
   1794             self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
   1795 
class CharmapTest(unittest.TestCase):
    """charmap_decode() with the three supported mapping flavors:
    unicode-string maps, int->unicode-string maps, and int->int maps,
    each under the strict / replace / ignore error handlers."""

    def test_decode_with_string_map(self):
        """Map is a unicode string indexed by byte value; U+FFFE and
        out-of-range bytes are treated as unmapped."""
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        # Byte 0x02 is beyond the two-character map -> strict fails.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
        )

        # U+FFFE in the map marks the position as unmapped.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe"
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        # With an empty map every byte is unmapped, but "ignore" must
        # still report all input as consumed.
        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        """Map is a dict from byte value to a replacement unicode string
        (possibly multi-character or empty); None/U+FFFE mean unmapped."""
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'a', 1: u'b', 2: u'c'}),
            (u"abc", 3)
        )

        # Multi-character replacements are concatenated.
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
            (u"AaBbCc", 3)
        )

        # Non-BMP replacements are allowed.
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
            (u"\U0010FFFFbc", 3)
        )

        # An empty replacement string silently drops the byte.
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'a', 1: u'b', 2: u''}),
            (u"ab", 3)
        )

        # Missing key -> unmapped -> strict fails.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
                                   {0: u'a', 1: u'b'}
        )

        # None value -> unmapped -> strict fails.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
                                   {0: u'a', 1: u'b', 2: None}
        )

        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
                                   {0: u'a', 1: u'b', 2: u'\ufffe'}
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b'}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b', 2: None}),
            (u"ab\ufffd", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b', 2: u'\ufffe'}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b'}),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b', 2: None}),
            (u"ab", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b', 2: u'\ufffe'}),
            (u"ab", 3)
        )

        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            (u"", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        """Map is a dict from byte value to a unicode code point (int);
        0xFFFE means unmapped, values above 0x10FFFF are invalid."""
        a = ord(u'a')
        b = ord(u'b')
        c = ord(u'c')

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            (u"abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            (u"\U0010FFFFbc", 3)
        )

        # Beyond the Unicode range -> TypeError, not a decode error.
        self.assertRaises(TypeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
                                   {0: 0x110000, 1: b, 2: c}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
                                   {0: a, 1: b},
        )

        # 0xFFFE marks the byte as unmapped.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
                                   {0: a, 1: b, 2: 0xFFFE},
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: a, 1: b, 2: 0xFFFE}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: a, 1: b, 2: 0xFFFE}),
            (u"ab", 3)
        )
   1978 
   1979 
   1980 class WithStmtTest(unittest.TestCase):
   1981     def test_encodedfile(self):
   1982         f = StringIO.StringIO("\xc3\xbc")
   1983         with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
   1984             self.assertEqual(ef.read(), "\xfc")
   1985 
   1986     def test_streamreaderwriter(self):
   1987         f = StringIO.StringIO("\xc3\xbc")
   1988         info = codecs.lookup("utf-8")
   1989         with codecs.StreamReaderWriter(f, info.streamreader,
   1990                                        info.streamwriter, 'strict') as srw:
   1991             self.assertEqual(srw.read(), u"\xfc")
   1992 
   1993 
class UnicodeEscapeTest(unittest.TestCase):
    """Encode/decode behavior of the unicode-escape codec."""

    def test_empty(self):
        # Empty input -> empty output, nothing consumed.
        self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
        self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))

    def test_raw_encode(self):
        # Printable ASCII (except the backslash) passes through unescaped.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != ord('\\'):
                self.assertEqual(encode(unichr(b)), (chr(b), 1))

    def test_raw_decode(self):
        # Any non-backslash byte decodes to itself.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != ord('\\'):
                self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))

    def test_escape_encode(self):
        # \t \n \r \\ use their short escapes; other controls and all
        # non-ASCII use \xXX, \uXXXX or \UXXXXXXXX forms.
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check(u'\t', r'\t')
        check(u'\n', r'\n')
        check(u'\r', r'\r')
        check(u'\\', r'\\')
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(unichr(b), '\\x%02x' % b)
        for b in range(127, 256):
            check(unichr(b), '\\x%02x' % b)
        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        # All recognized escapes: line continuation, quotes, simple
        # escapes, octal (1-3 digits), hex, \u and \U.  Unrecognized
        # escapes pass through with the backslash preserved.
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check("[\\\n]", u"[]")
        check(r'[\"]', u'["]')
        check(r"[\']", u"[']")
        check(r"[\\]", ur"[\]")
        check(r"[\a]", u"[\x07]")
        check(r"[\b]", u"[\x08]")
        check(r"[\t]", u"[\x09]")
        check(r"[\n]", u"[\x0a]")
        check(r"[\v]", u"[\x0b]")
        check(r"[\f]", u"[\x0c]")
        check(r"[\r]", u"[\x0d]")
        check(r"[\7]", u"[\x07]")
        check(r"[\8]", ur"[\8]")
        check(r"[\78]", u"[\x078]")
        check(r"[\41]", u"[!]")
        check(r"[\418]", u"[!8]")
        check(r"[\101]", u"[A]")
        check(r"[\1010]", u"[A0]")
        check(r"[\x41]", u"[A]")
        check(r"[\x410]", u"[A0]")
        check(r"\u20ac", u"\u20ac")
        check(r"\U0001d120", u"\U0001d120")
        for b in range(256):
            if chr(b) not in '\n"\'\\abtnvfr01234567xuUN':
                check('\\' + chr(b), u'\\' + unichr(b))

    def test_decode_errors(self):
        # Truncated \x, \u, \U escapes raise UnicodeDecodeError in
        # strict mode; "ignore" drops them, "replace" yields U+FFFD.
        decode = codecs.unicode_escape_decode
        for c, d in ('x', 2), ('u', 4), ('U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  "\\" + c + "0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  "[\\" + c + "0"*i + "]")
                data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
                self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))
        # Escapes above U+10FFFF are invalid as well.
        self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
        self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
        self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
   2070 
   2071 
   2072 class RawUnicodeEscapeTest(unittest.TestCase):
   2073     def test_empty(self):
   2074         self.assertEqual(codecs.raw_unicode_escape_encode(u""), ("", 0))
   2075         self.assertEqual(codecs.raw_unicode_escape_decode(""), (u"", 0))
   2076 
   2077     def test_raw_encode(self):
   2078         encode = codecs.raw_unicode_escape_encode
   2079         for b in range(256):
   2080             self.assertEqual(encode(unichr(b)), (chr(b), 1))
   2081 
   2082     def test_raw_decode(self):
   2083         decode = codecs.raw_unicode_escape_decode
   2084         for b in range(256):
   2085             self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
   2086 
   2087     def test_escape_encode(self):
   2088         encode = codecs.raw_unicode_escape_encode
   2089         check = coding_checker(self, encode)
   2090         for b in range(256):
   2091             if chr(b) not in 'uU':
   2092                 check(u'\\' + unichr(b), '\\' + chr(b))
   2093         check(u'\u20ac', r'\u20ac')
   2094         check(u'\U0001d120', r'\U0001d120')
   2095 
   2096     def test_escape_decode(self):
   2097         decode = codecs.raw_unicode_escape_decode
   2098         check = coding_checker(self, decode)
   2099         for b in range(256):
   2100             if chr(b) not in 'uU':
   2101                 check('\\' + chr(b), u'\\' + unichr(b))
   2102         check(r"\u20ac", u"\u20ac")
   2103         check(r"\U0001d120", u"\U0001d120")
   2104 
   2105     def test_decode_errors(self):
   2106         decode = codecs.raw_unicode_escape_decode
   2107         for c, d in ('u', 4), ('U', 4):
   2108             for i in range(d):
   2109                 self.assertRaises(UnicodeDecodeError, decode,
   2110                                   "\\" + c + "0"*i)
   2111                 self.assertRaises(UnicodeDecodeError, decode,
   2112                                   "[\\" + c + "0"*i + "]")
   2113                 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
   2114                 self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
   2115                 self.assertEqual(decode(data, "replace"),
   2116                                  (u"[\ufffd]\ufffd", len(data)))
   2117         self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
   2118         self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
   2119         self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
   2120 
   2121 
class BomTest(unittest.TestCase):
    """The BOM must be written exactly once per file: at position 0
    and never again after seeking elsewhere."""

    def test_seek0(self):
        data = u"1234567890"
        # All the BOM-writing encodings.
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
   2177 
   2178 
   2179 class TransformCodecTest(unittest.TestCase):
   2180 
   2181     def test_quopri_stateless(self):
   2182         # Should encode with quotetabs=True
   2183         encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
   2184         self.assertEqual(encoded, b"space=20tab=09eol=20\n")
   2185         # But should still support unescaped tabs and spaces
   2186         unescaped = b"space tab eol\n"
   2187         self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)
   2188 
   2189     def test_uu_invalid(self):
   2190         # Missing "begin" line
   2191         self.assertRaises(ValueError, codecs.decode, "", "uu-codec")
   2192 
   2193 
   2194 def test_main():
   2195     test_support.run_unittest(
   2196         UTF32Test,
   2197         UTF32LETest,
   2198         UTF32BETest,
   2199         UTF16Test,
   2200         UTF16LETest,
   2201         UTF16BETest,
   2202         UTF8Test,
   2203         UTF8SigTest,
   2204         UTF7Test,
   2205         UTF16ExTest,
   2206         ReadBufferTest,
   2207         CharBufferTest,
   2208         EscapeDecodeTest,
   2209         RecodingTest,
   2210         PunycodeTest,
   2211         UnicodeInternalTest,
   2212         NameprepTest,
   2213         IDNACodecTest,
   2214         CodecsModuleTest,
   2215         StreamReaderTest,
   2216         EncodedFileTest,
   2217         Str2StrTest,
   2218         BasicUnicodeTest,
   2219         BasicStrTest,
   2220         CharmapTest,
   2221         WithStmtTest,
   2222         UnicodeEscapeTest,
   2223         RawUnicodeEscapeTest,
   2224         BomTest,
   2225         TransformCodecTest,
   2226     )
   2227 
   2228 
# Allow running this test module directly as a script.
if __name__ == "__main__":
    test_main()
   2231