Home | History | Annotate | Download | only in test
      1 import codecs
      2 import html.entities
      3 import sys
      4 import test.support
      5 import unicodedata
      6 import unittest
      7 
      8 class PosReturn:
      9     # this can be used for configurable callbacks
     10 
     11     def __init__(self):
     12         self.pos = 0
     13 
     14     def handle(self, exc):
     15         oldpos = self.pos
     16         realpos = oldpos
     17         if realpos<0:
     18             realpos = len(exc.object) + realpos
     19         # if we don't advance this time, terminate on the next call
     20         # otherwise we'd get an endless loop
     21         if realpos <= exc.start:
     22             self.pos = len(exc.object)
     23         return ("<?>", oldpos)
     24 
     25 # A UnicodeEncodeError object with a bad start attribute
     26 class BadStartUnicodeEncodeError(UnicodeEncodeError):
     27     def __init__(self):
     28         UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad")
     29         self.start = []
     30 
     31 # A UnicodeEncodeError object with a bad object attribute
     32 class BadObjectUnicodeEncodeError(UnicodeEncodeError):
     33     def __init__(self):
     34         UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad")
     35         self.object = []
     36 
     37 # A UnicodeDecodeError object without an end attribute
     38 class NoEndUnicodeDecodeError(UnicodeDecodeError):
     39     def __init__(self):
     40         UnicodeDecodeError.__init__(self, "ascii", bytearray(b""), 0, 1, "bad")
     41         del self.end
     42 
     43 # A UnicodeDecodeError object with a bad object attribute
     44 class BadObjectUnicodeDecodeError(UnicodeDecodeError):
     45     def __init__(self):
     46         UnicodeDecodeError.__init__(self, "ascii", bytearray(b""), 0, 1, "bad")
     47         self.object = []
     48 
     49 # A UnicodeTranslateError object without a start attribute
     50 class NoStartUnicodeTranslateError(UnicodeTranslateError):
     51     def __init__(self):
     52         UnicodeTranslateError.__init__(self, "", 0, 1, "bad")
     53         del self.start
     54 
     55 # A UnicodeTranslateError object without an end attribute
     56 class NoEndUnicodeTranslateError(UnicodeTranslateError):
     57     def __init__(self):
     58         UnicodeTranslateError.__init__(self,  "", 0, 1, "bad")
     59         del self.end
     60 
     61 # A UnicodeTranslateError object without an object attribute
     62 class NoObjectUnicodeTranslateError(UnicodeTranslateError):
     63     def __init__(self):
     64         UnicodeTranslateError.__init__(self, "", 0, 1, "bad")
     65         del self.object
     66 
     67 class CodecCallbackTest(unittest.TestCase):
     68 
     69     def test_xmlcharrefreplace(self):
     70         # replace unencodable characters which numeric character entities.
     71         # For ascii, latin-1 and charmaps this is completely implemented
     72         # in C and should be reasonably fast.
     73         s = "\u30b9\u30d1\u30e2 \xe4nd eggs"
     74         self.assertEqual(
     75             s.encode("ascii", "xmlcharrefreplace"),
     76             b"&#12473;&#12497;&#12514; &#228;nd eggs"
     77         )
     78         self.assertEqual(
     79             s.encode("latin-1", "xmlcharrefreplace"),
     80             b"&#12473;&#12497;&#12514; \xe4nd eggs"
     81         )
     82 
     83     def test_xmlcharnamereplace(self):
     84         # This time use a named character entity for unencodable
     85         # characters, if one is available.
     86 
     87         def xmlcharnamereplace(exc):
     88             if not isinstance(exc, UnicodeEncodeError):
     89                 raise TypeError("don't know how to handle %r" % exc)
     90             l = []
     91             for c in exc.object[exc.start:exc.end]:
     92                 try:
     93                     l.append("&%s;" % html.entities.codepoint2name[ord(c)])
     94                 except KeyError:
     95                     l.append("&#%d;" % ord(c))
     96             return ("".join(l), exc.end)
     97 
     98         codecs.register_error(
     99             "test.xmlcharnamereplace", xmlcharnamereplace)
    100 
    101         sin = "\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
    102         sout = b"&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
    103         self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
    104         sout = b"\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
    105         self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
    106         sout = b"\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
    107         self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)
    108 
    109     def test_uninamereplace(self):
    110         # We're using the names from the unicode database this time,
    111         # and we're doing "syntax highlighting" here, i.e. we include
    112         # the replaced text in ANSI escape sequences. For this it is
    113         # useful that the error handler is not called for every single
    114         # unencodable character, but for a complete sequence of
    115         # unencodable characters, otherwise we would output many
    116         # unnecessary escape sequences.
    117 
    118         def uninamereplace(exc):
    119             if not isinstance(exc, UnicodeEncodeError):
    120                 raise TypeError("don't know how to handle %r" % exc)
    121             l = []
    122             for c in exc.object[exc.start:exc.end]:
    123                 l.append(unicodedata.name(c, "0x%x" % ord(c)))
    124             return ("\033[1m%s\033[0m" % ", ".join(l), exc.end)
    125 
    126         codecs.register_error(
    127             "test.uninamereplace", uninamereplace)
    128 
    129         sin = "\xac\u1234\u20ac\u8000"
    130         sout = b"\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
    131         self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout)
    132 
    133         sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
    134         self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout)
    135 
    136         sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
    137         self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout)
    138 
    139     def test_backslashescape(self):
    140         # Does the same as the "unicode-escape" encoding, but with different
    141         # base encodings.
    142         sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
    143         sout = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
    144         self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)
    145 
    146         sout = b"a\xac\\u1234\\u20ac\\u8000\\U0010ffff"
    147         self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)
    148 
    149         sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff"
    150         self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
    151 
    152     def test_nameescape(self):
    153         # Does the same as backslashescape, but prefers ``\N{...}`` escape
    154         # sequences.
    155         sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
    156         sout = (b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
    157                 b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
    158         self.assertEqual(sin.encode("ascii", "namereplace"), sout)
    159 
    160         sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
    161                 b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
    162         self.assertEqual(sin.encode("latin-1", "namereplace"), sout)
    163 
    164         sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\xa4'
    165                 b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
    166         self.assertEqual(sin.encode("iso-8859-15", "namereplace"), sout)
    167 
    168     def test_decoding_callbacks(self):
    169         # This is a test for a decoding callback handler
    170         # that allows the decoding of the invalid sequence
    171         # "\xc0\x80" and returns "\x00" instead of raising an error.
    172         # All other illegal sequences will be handled strictly.
    173         def relaxedutf8(exc):
    174             if not isinstance(exc, UnicodeDecodeError):
    175                 raise TypeError("don't know how to handle %r" % exc)
    176             if exc.object[exc.start:exc.start+2] == b"\xc0\x80":
    177                 return ("\x00", exc.start+2) # retry after two bytes
    178             else:
    179                 raise exc
    180 
    181         codecs.register_error("test.relaxedutf8", relaxedutf8)
    182 
    183         # all the "\xc0\x80" will be decoded to "\x00"
    184         sin = b"a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
    185         sout = "a\x00b\x00c\xfc\x00\x00"
    186         self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)
    187 
    188         # "\xc0\x81" is not valid and a UnicodeDecodeError will be raised
    189         sin = b"\xc0\x80\xc0\x81"
    190         self.assertRaises(UnicodeDecodeError, sin.decode,
    191                           "utf-8", "test.relaxedutf8")
    192 
    193     def test_charmapencode(self):
    194         # For charmap encodings the replacement string will be
    195         # mapped through the encoding again. This means, that
    196         # to be able to use e.g. the "replace" handler, the
    197         # charmap has to have a mapping for "?".
    198         charmap = dict((ord(c), bytes(2*c.upper(), 'ascii')) for c in "abcdefgh")
    199         sin = "abc"
    200         sout = b"AABBCC"
    201         self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
    202 
    203         sin = "abcA"
    204         self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
    205 
    206         charmap[ord("?")] = b"XYZ"
    207         sin = "abcDEF"
    208         sout = b"AABBCCXYZXYZXYZ"
    209         self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
    210 
    211         charmap[ord("?")] = "XYZ" # wrong type in mapping
    212         self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
    213 
    214     def test_decodeunicodeinternal(self):
    215         with test.support.check_warnings(('unicode_internal codec has been '
    216                                           'deprecated', DeprecationWarning)):
    217             self.assertRaises(
    218                 UnicodeDecodeError,
    219                 b"\x00\x00\x00\x00\x00".decode,
    220                 "unicode-internal",
    221             )
    222             if len('\0'.encode('unicode-internal')) == 4:
    223                 def handler_unicodeinternal(exc):
    224                     if not isinstance(exc, UnicodeDecodeError):
    225                         raise TypeError("don't know how to handle %r" % exc)
    226                     return ("\x01", 1)
    227 
    228                 self.assertEqual(
    229                     b"\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"),
    230                     "\u0000"
    231                 )
    232 
    233                 self.assertEqual(
    234                     b"\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"),
    235                     "\u0000\ufffd"
    236                 )
    237 
    238                 self.assertEqual(
    239                     b"\x00\x00\x00\x00\x00".decode("unicode-internal", "backslashreplace"),
    240                     "\u0000\\x00"
    241                 )
    242 
    243                 codecs.register_error("test.hui", handler_unicodeinternal)
    244 
    245                 self.assertEqual(
    246                     b"\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"),
    247                     "\u0000\u0001\u0000"
    248                 )
    249 
    250     def test_callbacks(self):
    251         def handler1(exc):
    252             r = range(exc.start, exc.end)
    253             if isinstance(exc, UnicodeEncodeError):
    254                 l = ["<%d>" % ord(exc.object[pos]) for pos in r]
    255             elif isinstance(exc, UnicodeDecodeError):
    256                 l = ["<%d>" % exc.object[pos] for pos in r]
    257             else:
    258                 raise TypeError("don't know how to handle %r" % exc)
    259             return ("[%s]" % "".join(l), exc.end)
    260 
    261         codecs.register_error("test.handler1", handler1)
    262 
    263         def handler2(exc):
    264             if not isinstance(exc, UnicodeDecodeError):
    265                 raise TypeError("don't know how to handle %r" % exc)
    266             l = ["<%d>" % exc.object[pos] for pos in range(exc.start, exc.end)]
    267             return ("[%s]" % "".join(l), exc.end+1) # skip one character
    268 
    269         codecs.register_error("test.handler2", handler2)
    270 
    271         s = b"\x00\x81\x7f\x80\xff"
    272 
    273         self.assertEqual(
    274             s.decode("ascii", "test.handler1"),
    275             "\x00[<129>]\x7f[<128>][<255>]"
    276         )
    277         self.assertEqual(
    278             s.decode("ascii", "test.handler2"),
    279             "\x00[<129>][<128>]"
    280         )
    281 
    282         self.assertEqual(
    283             b"\\u3042\\u3xxx".decode("unicode-escape", "test.handler1"),
    284             "\u3042[<92><117><51>]xxx"
    285         )
    286 
    287         self.assertEqual(
    288             b"\\u3042\\u3xx".decode("unicode-escape", "test.handler1"),
    289             "\u3042[<92><117><51>]xx"
    290         )
    291 
    292         self.assertEqual(
    293             codecs.charmap_decode(b"abc", "test.handler1", {ord("a"): "z"})[0],
    294             "z[<98>][<99>]"
    295         )
    296 
    297         self.assertEqual(
    298             "g\xfc\xdfrk".encode("ascii", "test.handler1"),
    299             b"g[<252><223>]rk"
    300         )
    301 
    302         self.assertEqual(
    303             "g\xfc\xdf".encode("ascii", "test.handler1"),
    304             b"g[<252><223>]"
    305         )
    306 
    307     def test_longstrings(self):
    308         # test long strings to check for memory overflow problems
    309         errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
    310                    "backslashreplace", "namereplace"]
    311         # register the handlers under different names,
    312         # to prevent the codec from recognizing the name
    313         for err in errors:
    314             codecs.register_error("test." + err, codecs.lookup_error(err))
    315         l = 1000
    316         errors += [ "test." + err for err in errors ]
    317         for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]:
    318             for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
    319                         "utf-8", "utf-7", "utf-16", "utf-32"):
    320                 for err in errors:
    321                     try:
    322                         uni.encode(enc, err)
    323                     except UnicodeError:
    324                         pass
    325 
    326     def check_exceptionobjectargs(self, exctype, args, msg):
    327         # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
    328         # check with one missing argument
    329         self.assertRaises(TypeError, exctype, *args[:-1])
    330         # check with one argument too much
    331         self.assertRaises(TypeError, exctype, *(args + ["too much"]))
    332         # check with one argument of the wrong type
    333         wrongargs = [ "spam", b"eggs", b"spam", 42, 1.0, None ]
    334         for i in range(len(args)):
    335             for wrongarg in wrongargs:
    336                 if type(wrongarg) is type(args[i]):
    337                     continue
    338                 # build argument array
    339                 callargs = []
    340                 for j in range(len(args)):
    341                     if i==j:
    342                         callargs.append(wrongarg)
    343                     else:
    344                         callargs.append(args[i])
    345                 self.assertRaises(TypeError, exctype, *callargs)
    346 
    347         # check with the correct number and type of arguments
    348         exc = exctype(*args)
    349         self.assertEqual(str(exc), msg)
    350 
    351     def test_unicodeencodeerror(self):
    352         self.check_exceptionobjectargs(
    353             UnicodeEncodeError,
    354             ["ascii", "g\xfcrk", 1, 2, "ouch"],
    355             "'ascii' codec can't encode character '\\xfc' in position 1: ouch"
    356         )
    357         self.check_exceptionobjectargs(
    358             UnicodeEncodeError,
    359             ["ascii", "g\xfcrk", 1, 4, "ouch"],
    360             "'ascii' codec can't encode characters in position 1-3: ouch"
    361         )
    362         self.check_exceptionobjectargs(
    363             UnicodeEncodeError,
    364             ["ascii", "\xfcx", 0, 1, "ouch"],
    365             "'ascii' codec can't encode character '\\xfc' in position 0: ouch"
    366         )
    367         self.check_exceptionobjectargs(
    368             UnicodeEncodeError,
    369             ["ascii", "\u0100x", 0, 1, "ouch"],
    370             "'ascii' codec can't encode character '\\u0100' in position 0: ouch"
    371         )
    372         self.check_exceptionobjectargs(
    373             UnicodeEncodeError,
    374             ["ascii", "\uffffx", 0, 1, "ouch"],
    375             "'ascii' codec can't encode character '\\uffff' in position 0: ouch"
    376         )
    377         self.check_exceptionobjectargs(
    378             UnicodeEncodeError,
    379             ["ascii", "\U00010000x", 0, 1, "ouch"],
    380             "'ascii' codec can't encode character '\\U00010000' in position 0: ouch"
    381         )
    382 
    383     def test_unicodedecodeerror(self):
    384         self.check_exceptionobjectargs(
    385             UnicodeDecodeError,
    386             ["ascii", bytearray(b"g\xfcrk"), 1, 2, "ouch"],
    387             "'ascii' codec can't decode byte 0xfc in position 1: ouch"
    388         )
    389         self.check_exceptionobjectargs(
    390             UnicodeDecodeError,
    391             ["ascii", bytearray(b"g\xfcrk"), 1, 3, "ouch"],
    392             "'ascii' codec can't decode bytes in position 1-2: ouch"
    393         )
    394 
    395     def test_unicodetranslateerror(self):
    396         self.check_exceptionobjectargs(
    397             UnicodeTranslateError,
    398             ["g\xfcrk", 1, 2, "ouch"],
    399             "can't translate character '\\xfc' in position 1: ouch"
    400         )
    401         self.check_exceptionobjectargs(
    402             UnicodeTranslateError,
    403             ["g\u0100rk", 1, 2, "ouch"],
    404             "can't translate character '\\u0100' in position 1: ouch"
    405         )
    406         self.check_exceptionobjectargs(
    407             UnicodeTranslateError,
    408             ["g\uffffrk", 1, 2, "ouch"],
    409             "can't translate character '\\uffff' in position 1: ouch"
    410         )
    411         self.check_exceptionobjectargs(
    412             UnicodeTranslateError,
    413             ["g\U00010000rk", 1, 2, "ouch"],
    414             "can't translate character '\\U00010000' in position 1: ouch"
    415         )
    416         self.check_exceptionobjectargs(
    417             UnicodeTranslateError,
    418             ["g\xfcrk", 1, 3, "ouch"],
    419             "can't translate characters in position 1-2: ouch"
    420         )
    421 
    422     def test_badandgoodstrictexceptions(self):
    423         # "strict" complains about a non-exception passed in
    424         self.assertRaises(
    425             TypeError,
    426             codecs.strict_errors,
    427             42
    428         )
    429         # "strict" complains about the wrong exception type
    430         self.assertRaises(
    431             Exception,
    432             codecs.strict_errors,
    433             Exception("ouch")
    434         )
    435 
    436         # If the correct exception is passed in, "strict" raises it
    437         self.assertRaises(
    438             UnicodeEncodeError,
    439             codecs.strict_errors,
    440             UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")
    441         )
    442         self.assertRaises(
    443             UnicodeDecodeError,
    444             codecs.strict_errors,
    445             UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
    446         )
    447         self.assertRaises(
    448             UnicodeTranslateError,
    449             codecs.strict_errors,
    450             UnicodeTranslateError("\u3042", 0, 1, "ouch")
    451         )
    452 
    453     def test_badandgoodignoreexceptions(self):
    454         # "ignore" complains about a non-exception passed in
    455         self.assertRaises(
    456            TypeError,
    457            codecs.ignore_errors,
    458            42
    459         )
    460         # "ignore" complains about the wrong exception type
    461         self.assertRaises(
    462            TypeError,
    463            codecs.ignore_errors,
    464            UnicodeError("ouch")
    465         )
    466         # If the correct exception is passed in, "ignore" returns an empty replacement
    467         self.assertEqual(
    468             codecs.ignore_errors(
    469                 UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")),
    470             ("", 2)
    471         )
    472         self.assertEqual(
    473             codecs.ignore_errors(
    474                 UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")),
    475             ("", 2)
    476         )
    477         self.assertEqual(
    478             codecs.ignore_errors(
    479                 UnicodeTranslateError("a\u3042b", 1, 2, "ouch")),
    480             ("", 2)
    481         )
    482 
    def test_badandgoodreplaceexceptions(self):
        handler = codecs.replace_errors

        # "replace" complains about a non-exception passed in
        with self.assertRaises(TypeError):
            handler(42)

        # "replace" complains about the wrong exception type, and about
        # encode/decode errors whose object attribute has a bad type
        rejected = (
            UnicodeError("ouch"),
            BadObjectUnicodeEncodeError(),
            BadObjectUnicodeDecodeError(),
        )
        for exc in rejected:
            with self.assertRaises(TypeError):
                handler(exc)

        # With the correct exception, "replace" returns an "?" or "\ufffd" replacement
        accepted = (
            (UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch"), "?"),
            (UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch"),
             "\ufffd"),
            (UnicodeTranslateError("a\u3042b", 1, 2, "ouch"), "\ufffd"),
        )
        for exc, replacement in accepted:
            self.assertEqual(handler(exc), (replacement, 2))
    522 
    523     def test_badandgoodxmlcharrefreplaceexceptions(self):
    524         # "xmlcharrefreplace" complains about a non-exception passed in
    525         self.assertRaises(
    526            TypeError,
    527            codecs.xmlcharrefreplace_errors,
    528            42
    529         )
    530         # "xmlcharrefreplace" complains about the wrong exception types
    531         self.assertRaises(
    532            TypeError,
    533            codecs.xmlcharrefreplace_errors,
    534            UnicodeError("ouch")
    535         )
    536         # "xmlcharrefreplace" can only be used for encoding
    537         self.assertRaises(
    538             TypeError,
    539             codecs.xmlcharrefreplace_errors,
    540             UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
    541         )
    542         self.assertRaises(
    543             TypeError,
    544             codecs.xmlcharrefreplace_errors,
    545             UnicodeTranslateError("\u3042", 0, 1, "ouch")
    546         )
    547         # Use the correct exception
    548         cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 99999, 100000,
    549               999999, 1000000)
    550         cs += (0xd800, 0xdfff)
    551         s = "".join(chr(c) for c in cs)
    552         self.assertEqual(
    553             codecs.xmlcharrefreplace_errors(
    554                 UnicodeEncodeError("ascii", "a" + s + "b",
    555                                    1, 1 + len(s), "ouch")
    556             ),
    557             ("".join("&#%d;" % c for c in cs), 1 + len(s))
    558         )
    559 
    560     def test_badandgoodbackslashreplaceexceptions(self):
    561         # "backslashreplace" complains about a non-exception passed in
    562         self.assertRaises(
    563            TypeError,
    564            codecs.backslashreplace_errors,
    565            42
    566         )
    567         # "backslashreplace" complains about the wrong exception types
    568         self.assertRaises(
    569            TypeError,
    570            codecs.backslashreplace_errors,
    571            UnicodeError("ouch")
    572         )
    573         # Use the correct exception
    574         tests = [
    575             ("\u3042", "\\u3042"),
    576             ("\n", "\\x0a"),
    577             ("a", "\\x61"),
    578             ("\x00", "\\x00"),
    579             ("\xff", "\\xff"),
    580             ("\u0100", "\\u0100"),
    581             ("\uffff", "\\uffff"),
    582             ("\U00010000", "\\U00010000"),
    583             ("\U0010ffff", "\\U0010ffff"),
    584             # Lone surrogates
    585             ("\ud800", "\\ud800"),
    586             ("\udfff", "\\udfff"),
    587             ("\ud800\udfff", "\\ud800\\udfff"),
    588         ]
    589         for s, r in tests:
    590             with self.subTest(str=s):
    591                 self.assertEqual(
    592                     codecs.backslashreplace_errors(
    593                         UnicodeEncodeError("ascii", "a" + s + "b",
    594                                            1, 1 + len(s), "ouch")),
    595                     (r, 1 + len(s))
    596                 )
    597                 self.assertEqual(
    598                     codecs.backslashreplace_errors(
    599                         UnicodeTranslateError("a" + s + "b",
    600                                               1, 1 + len(s), "ouch")),
    601                     (r, 1 + len(s))
    602                 )
    603         tests = [
    604             (b"a", "\\x61"),
    605             (b"\n", "\\x0a"),
    606             (b"\x00", "\\x00"),
    607             (b"\xff", "\\xff"),
    608         ]
    609         for b, r in tests:
    610             with self.subTest(bytes=b):
    611                 self.assertEqual(
    612                     codecs.backslashreplace_errors(
    613                         UnicodeDecodeError("ascii", bytearray(b"a" + b + b"b"),
    614                                            1, 2, "ouch")),
    615                     (r, 2)
    616                 )
    617 
    618     def test_badandgoodnamereplaceexceptions(self):
    619         # "namereplace" complains about a non-exception passed in
    620         self.assertRaises(
    621            TypeError,
    622            codecs.namereplace_errors,
    623            42
    624         )
    625         # "namereplace" complains about the wrong exception types
    626         self.assertRaises(
    627            TypeError,
    628            codecs.namereplace_errors,
    629            UnicodeError("ouch")
    630         )
    631         # "namereplace" can only be used for encoding
    632         self.assertRaises(
    633             TypeError,
    634             codecs.namereplace_errors,
    635             UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
    636         )
    637         self.assertRaises(
    638             TypeError,
    639             codecs.namereplace_errors,
    640             UnicodeTranslateError("\u3042", 0, 1, "ouch")
    641         )
    642         # Use the correct exception
    643         tests = [
    644             ("\u3042", "\\N{HIRAGANA LETTER A}"),
    645             ("\x00", "\\x00"),
    646             ("\ufbf9", "\\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH "
    647                        "HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}"),
    648             ("\U000e007f", "\\N{CANCEL TAG}"),
    649             ("\U0010ffff", "\\U0010ffff"),
    650             # Lone surrogates
    651             ("\ud800", "\\ud800"),
    652             ("\udfff", "\\udfff"),
    653             ("\ud800\udfff", "\\ud800\\udfff"),
    654         ]
    655         for s, r in tests:
    656             with self.subTest(str=s):
    657                 self.assertEqual(
    658                     codecs.namereplace_errors(
    659                         UnicodeEncodeError("ascii", "a" + s + "b",
    660                                            1, 1 + len(s), "ouch")),
    661                     (r, 1 + len(s))
    662                 )
    663 
    664     def test_badandgoodsurrogateescapeexceptions(self):
    665         surrogateescape_errors = codecs.lookup_error('surrogateescape')
    666         # "surrogateescape" complains about a non-exception passed in
    667         self.assertRaises(
    668            TypeError,
    669            surrogateescape_errors,
    670            42
    671         )
    672         # "surrogateescape" complains about the wrong exception types
    673         self.assertRaises(
    674            TypeError,
    675            surrogateescape_errors,
    676            UnicodeError("ouch")
    677         )
    678         # "surrogateescape" can not be used for translating
    679         self.assertRaises(
    680             TypeError,
    681             surrogateescape_errors,
    682             UnicodeTranslateError("\udc80", 0, 1, "ouch")
    683         )
    684         # Use the correct exception
    685         for s in ("a", "\udc7f", "\udd00"):
    686             with self.subTest(str=s):
    687                 self.assertRaises(
    688                     UnicodeEncodeError,
    689                     surrogateescape_errors,
    690                     UnicodeEncodeError("ascii", s, 0, 1, "ouch")
    691                 )
    692         self.assertEqual(
    693             surrogateescape_errors(
    694                 UnicodeEncodeError("ascii", "a\udc80b", 1, 2, "ouch")),
    695             (b"\x80", 2)
    696         )
    697         self.assertRaises(
    698             UnicodeDecodeError,
    699             surrogateescape_errors,
    700             UnicodeDecodeError("ascii", bytearray(b"a"), 0, 1, "ouch")
    701         )
    702         self.assertEqual(
    703             surrogateescape_errors(
    704                 UnicodeDecodeError("ascii", bytearray(b"a\x80b"), 1, 2, "ouch")),
    705             ("\udc80", 2)
    706         )
    707 
    708     def test_badandgoodsurrogatepassexceptions(self):
    709         surrogatepass_errors = codecs.lookup_error('surrogatepass')
    710         # "surrogatepass" complains about a non-exception passed in
    711         self.assertRaises(
    712            TypeError,
    713            surrogatepass_errors,
    714            42
    715         )
    716         # "surrogatepass" complains about the wrong exception types
    717         self.assertRaises(
    718            TypeError,
    719            surrogatepass_errors,
    720            UnicodeError("ouch")
    721         )
    722         # "surrogatepass" can not be used for translating
    723         self.assertRaises(
    724             TypeError,
    725             surrogatepass_errors,
    726             UnicodeTranslateError("\ud800", 0, 1, "ouch")
    727         )
    728         # Use the correct exception
    729         for enc in ("utf-8", "utf-16le", "utf-16be", "utf-32le", "utf-32be"):
    730             with self.subTest(encoding=enc):
    731                 self.assertRaises(
    732                     UnicodeEncodeError,
    733                     surrogatepass_errors,
    734                     UnicodeEncodeError(enc, "a", 0, 1, "ouch")
    735                 )
    736                 self.assertRaises(
    737                     UnicodeDecodeError,
    738                     surrogatepass_errors,
    739                     UnicodeDecodeError(enc, "a".encode(enc), 0, 1, "ouch")
    740                 )
    741         for s in ("\ud800", "\udfff", "\ud800\udfff"):
    742             with self.subTest(str=s):
    743                 self.assertRaises(
    744                     UnicodeEncodeError,
    745                     surrogatepass_errors,
    746                     UnicodeEncodeError("ascii", s, 0, len(s), "ouch")
    747                 )
    748         tests = [
    749             ("utf-8", "\ud800", b'\xed\xa0\x80', 3),
    750             ("utf-16le", "\ud800", b'\x00\xd8', 2),
    751             ("utf-16be", "\ud800", b'\xd8\x00', 2),
    752             ("utf-32le", "\ud800", b'\x00\xd8\x00\x00', 4),
    753             ("utf-32be", "\ud800", b'\x00\x00\xd8\x00', 4),
    754             ("utf-8", "\udfff", b'\xed\xbf\xbf', 3),
    755             ("utf-16le", "\udfff", b'\xff\xdf', 2),
    756             ("utf-16be", "\udfff", b'\xdf\xff', 2),
    757             ("utf-32le", "\udfff", b'\xff\xdf\x00\x00', 4),
    758             ("utf-32be", "\udfff", b'\x00\x00\xdf\xff', 4),
    759             ("utf-8", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3),
    760             ("utf-16le", "\ud800\udfff", b'\x00\xd8\xff\xdf', 2),
    761             ("utf-16be", "\ud800\udfff", b'\xd8\x00\xdf\xff', 2),
    762             ("utf-32le", "\ud800\udfff", b'\x00\xd8\x00\x00\xff\xdf\x00\x00', 4),
    763             ("utf-32be", "\ud800\udfff", b'\x00\x00\xd8\x00\x00\x00\xdf\xff', 4),
    764         ]
    765         for enc, s, b, n in tests:
    766             with self.subTest(encoding=enc, str=s, bytes=b):
    767                 self.assertEqual(
    768                     surrogatepass_errors(
    769                         UnicodeEncodeError(enc, "a" + s + "b",
    770                                            1, 1 + len(s), "ouch")),
    771                     (b, 1 + len(s))
    772                 )
    773                 self.assertEqual(
    774                     surrogatepass_errors(
    775                         UnicodeDecodeError(enc, bytearray(b"a" + b[:n] + b"b"),
    776                                            1, 1 + n, "ouch")),
    777                     (s[:1], 1 + n)
    778                 )
    779 
    780     def test_badhandlerresults(self):
    781         results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
    782         encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
    783 
    784         for res in results:
    785             codecs.register_error("test.badhandler", lambda x: res)
    786             for enc in encs:
    787                 self.assertRaises(
    788                     TypeError,
    789                     "\u3042".encode,
    790                     enc,
    791                     "test.badhandler"
    792                 )
    793             for (enc, bytes) in (
    794                 ("ascii", b"\xff"),
    795                 ("utf-8", b"\xff"),
    796                 ("utf-7", b"+x-"),
    797                 ("unicode-internal", b"\x00"),
    798             ):
    799                 with test.support.check_warnings():
    800                     # unicode-internal has been deprecated
    801                     self.assertRaises(
    802                         TypeError,
    803                         bytes.decode,
    804                         enc,
    805                         "test.badhandler"
    806                     )
    807 
    808     def test_lookup(self):
    809         self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
    810         self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore"))
    811         self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
    812         self.assertEqual(
    813             codecs.xmlcharrefreplace_errors,
    814             codecs.lookup_error("xmlcharrefreplace")
    815         )
    816         self.assertEqual(
    817             codecs.backslashreplace_errors,
    818             codecs.lookup_error("backslashreplace")
    819         )
    820         self.assertEqual(
    821             codecs.namereplace_errors,
    822             codecs.lookup_error("namereplace")
    823         )
    824 
    825     def test_unencodablereplacement(self):
    826         def unencrepl(exc):
    827             if isinstance(exc, UnicodeEncodeError):
    828                 return ("\u4242", exc.end)
    829             else:
    830                 raise TypeError("don't know how to handle %r" % exc)
    831         codecs.register_error("test.unencreplhandler", unencrepl)
    832         for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
    833             self.assertRaises(
    834                 UnicodeEncodeError,
    835                 "\u4242".encode,
    836                 enc,
    837                 "test.unencreplhandler"
    838             )
    839 
    840     def test_badregistercall(self):
    841         # enhance coverage of:
    842         # Modules/_codecsmodule.c::register_error()
    843         # Python/codecs.c::PyCodec_RegisterError()
    844         self.assertRaises(TypeError, codecs.register_error, 42)
    845         self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)
    846 
    847     def test_badlookupcall(self):
    848         # enhance coverage of:
    849         # Modules/_codecsmodule.c::lookup_error()
    850         self.assertRaises(TypeError, codecs.lookup_error)
    851 
    852     def test_unknownhandler(self):
    853         # enhance coverage of:
    854         # Modules/_codecsmodule.c::lookup_error()
    855         self.assertRaises(LookupError, codecs.lookup_error, "test.unknown")
    856 
    857     def test_xmlcharrefvalues(self):
    858         # enhance coverage of:
    859         # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
    860         # and inline implementations
    861         v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000,
    862              500000, 1000000)
    863         s = "".join([chr(x) for x in v])
    864         codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors)
    865         for enc in ("ascii", "iso-8859-15"):
    866             for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"):
    867                 s.encode(enc, err)
    868 
    def test_decodehelper(self):
        # Coverage for the decoding side of the error handler machinery:
        #   Objects/unicodeobject.c::unicode_decode_call_errorhandler()
        # and the decoders calling it.

        # A handler name that was never registered.
        with self.assertRaises(LookupError):
            b"\xff".decode("ascii", "test.unknown")

        # A handler whose result is not a tuple.
        def baddecodereturn1(exc):
            return 42
        codecs.register_error("test.baddecodereturn1", baddecodereturn1)
        malformed_inputs = (
            (b"\xff", "ascii"),
            (b"\\", "unicode-escape"),
            (b"\\x0", "unicode-escape"),
            (b"\\x0y", "unicode-escape"),
            (b"\\Uffffeeee", "unicode-escape"),
            (b"\\uyyyy", "raw-unicode-escape"),
        )
        for raw, encoding in malformed_inputs:
            with self.assertRaises(TypeError):
                raw.decode(encoding, "test.baddecodereturn1")

        # A handler whose restart position is not an integer.
        def baddecodereturn2(exc):
            return ("?", None)
        codecs.register_error("test.baddecodereturn2", baddecodereturn2)
        with self.assertRaises(TypeError):
            b"\xff".decode("ascii", "test.baddecodereturn2")

        handler = PosReturn()
        codecs.register_error("test.posreturn", handler.handle)

        # Restart positions returned for the two-byte input b"\xff0", paired
        # with the expected text (None: the position must be rejected with
        # IndexError).
        restart_cases = (
            (-1, "<?>0"),       # valid negative position
            (-2, "<?><?>"),     # valid negative position
            (-3, None),         # negative position out of bounds
            (1, "<?>0"),        # valid positive position
            (2, "<?>"),         # largest valid position (one beyond the end)
            (3, None),          # invalid positive position
        )
        for position, expected in restart_cases:
            handler.pos = position
            if expected is None:
                with self.assertRaises(IndexError):
                    b"\xff0".decode("ascii", "test.posreturn")
            else:
                self.assertEqual(
                    b"\xff0".decode("ascii", "test.posreturn"), expected)

        # Restart at the "0"
        handler.pos = 6
        self.assertEqual(
            b"\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"),
            "<?>0")

        # Charmap decoding: a mapping to None, a mapping whose lookup fails
        # with an unrelated exception, and a mapping to an out-of-range
        # code point.
        class D(dict):
            def __getitem__(self, key):
                raise ValueError
        with self.assertRaises(UnicodeError):
            codecs.charmap_decode(b"\xff", "strict", {0xff: None})
        with self.assertRaises(ValueError):
            codecs.charmap_decode(b"\xff", "strict", D())
        with self.assertRaises(TypeError):
            codecs.charmap_decode(b"\xff", "strict", {0xff: sys.maxunicode+1})
    927 
    def test_encodehelper(self):
        # Coverage for the encoding side of the error handler machinery:
        #   Objects/unicodeobject.c::unicode_encode_call_errorhandler()
        # and the encoders calling it.

        # A handler name that was never registered.
        with self.assertRaises(LookupError):
            "\xff".encode("ascii", "test.unknown")

        # A handler whose result is not a tuple.
        def badencodereturn1(exc):
            return 42
        codecs.register_error("test.badencodereturn1", badencodereturn1)
        with self.assertRaises(TypeError):
            "\xff".encode("ascii", "test.badencodereturn1")

        # A handler whose restart position is not an integer.
        def badencodereturn2(exc):
            return ("?", None)
        codecs.register_error("test.badencodereturn2", badencodereturn2)
        with self.assertRaises(TypeError):
            "\xff".encode("ascii", "test.badencodereturn2")

        handler = PosReturn()
        codecs.register_error("test.posreturn", handler.handle)

        # Restart positions returned for the two-character input "\xff0",
        # paired with the expected bytes (None: the position must be
        # rejected with IndexError).
        restart_cases = (
            (-1, b"<?>0"),      # valid negative position
            (-2, b"<?><?>"),    # valid negative position
            (-3, None),         # negative position out of bounds
            (1, b"<?>0"),       # valid positive position
            (2, b"<?>"),        # largest valid position (one beyond the end)
            (3, None),          # invalid positive position
        )
        for position, expected in restart_cases:
            handler.pos = position
            if expected is None:
                with self.assertRaises(IndexError):
                    "\xff0".encode("ascii", "test.posreturn")
            else:
                self.assertEqual(
                    "\xff0".encode("ascii", "test.posreturn"), expected)

        handler.pos = 0

        # Charmap encoding with every kind of handler: a mapping to None, a
        # mapping whose lookup fails with an unrelated exception, and a
        # mapping to a value that does not fit into a byte.
        class D(dict):
            def __getitem__(self, key):
                raise ValueError
        handler_names = ("strict", "replace", "xmlcharrefreplace",
                         "backslashreplace", "namereplace", "test.posreturn")
        for err in handler_names:
            with self.assertRaises(UnicodeError):
                codecs.charmap_encode("\xff", err, {0xff: None})
            with self.assertRaises(ValueError):
                codecs.charmap_encode("\xff", err, D())
            with self.assertRaises(TypeError):
                codecs.charmap_encode("\xff", err, {0xff: 300})
    981 
    982     def test_translatehelper(self):
    983         # enhance coverage of:
    984         # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
    985         # and callers
    986         # (Unfortunately the errors argument is not directly accessible
    987         # from Python, so we can't test that much)
    988         class D(dict):
    989             def __getitem__(self, key):
    990                 raise ValueError
    991         #self.assertRaises(ValueError, "\xff".translate, D())
    992         self.assertRaises(ValueError, "\xff".translate, {0xff: sys.maxunicode+1})
    993         self.assertRaises(TypeError, "\xff".translate, {0xff: ()})
    994 
    995     def test_bug828737(self):
    996         charmap = {
    997             ord("&"): "&amp;",
    998             ord("<"): "&lt;",
    999             ord(">"): "&gt;",
   1000             ord('"'): "&quot;",
   1001         }
   1002 
   1003         for n in (1, 10, 100, 1000):
   1004             text = 'abc<def>ghi'*n
   1005             text.translate(charmap)
   1006 
   1007     def test_mutatingdecodehandler(self):
   1008         baddata = [
   1009             ("ascii", b"\xff"),
   1010             ("utf-7", b"++"),
   1011             ("utf-8",  b"\xff"),
   1012             ("utf-16", b"\xff"),
   1013             ("utf-32", b"\xff"),
   1014             ("unicode-escape", b"\\u123g"),
   1015             ("raw-unicode-escape", b"\\u123g"),
   1016             ("unicode-internal", b"\xff"),
   1017         ]
   1018 
   1019         def replacing(exc):
   1020             if isinstance(exc, UnicodeDecodeError):
   1021                 exc.object = 42
   1022                 return ("\u4242", 0)
   1023             else:
   1024                 raise TypeError("don't know how to handle %r" % exc)
   1025         codecs.register_error("test.replacing", replacing)
   1026 
   1027         with test.support.check_warnings():
   1028             # unicode-internal has been deprecated
   1029             for (encoding, data) in baddata:
   1030                 with self.assertRaises(TypeError):
   1031                     data.decode(encoding, "test.replacing")
   1032 
   1033         def mutating(exc):
   1034             if isinstance(exc, UnicodeDecodeError):
   1035                 exc.object[:] = b""
   1036                 return ("\u4242", 0)
   1037             else:
   1038                 raise TypeError("don't know how to handle %r" % exc)
   1039         codecs.register_error("test.mutating", mutating)
   1040         # If the decoder doesn't pick up the modified input the following
   1041         # will lead to an endless loop
   1042         with test.support.check_warnings():
   1043             # unicode-internal has been deprecated
   1044             for (encoding, data) in baddata:
   1045                 with self.assertRaises(TypeError):
   1046                     data.decode(encoding, "test.replacing")
   1047 
   1048     def test_fake_error_class(self):
   1049         handlers = [
   1050             codecs.strict_errors,
   1051             codecs.ignore_errors,
   1052             codecs.replace_errors,
   1053             codecs.backslashreplace_errors,
   1054             codecs.namereplace_errors,
   1055             codecs.xmlcharrefreplace_errors,
   1056             codecs.lookup_error('surrogateescape'),
   1057             codecs.lookup_error('surrogatepass'),
   1058         ]
   1059         for cls in UnicodeEncodeError, UnicodeDecodeError, UnicodeTranslateError:
   1060             class FakeUnicodeError(str):
   1061                 __class__ = cls
   1062             for handler in handlers:
   1063                 with self.subTest(handler=handler, error_class=cls):
   1064                     self.assertRaises(TypeError, handler, FakeUnicodeError())
   1065             class FakeUnicodeError(Exception):
   1066                 __class__ = cls
   1067             for handler in handlers:
   1068                 with self.subTest(handler=handler, error_class=cls):
   1069                     with self.assertRaises((TypeError, FakeUnicodeError)):
   1070                         handler(FakeUnicodeError())
   1071 
   1072 
# Run the tests through unittest's command-line runner when this file is
# executed directly as a script.
if __name__ == "__main__":
    unittest.main()
   1075