Home | History | Annotate | Download | only in test
      1 import test.test_support, unittest
      2 import sys, codecs, htmlentitydefs, unicodedata
      3 
      4 class PosReturn:
      5     # this can be used for configurable callbacks
      6 
      7     def __init__(self):
      8         self.pos = 0
      9 
     10     def handle(self, exc):
     11         oldpos = self.pos
     12         realpos = oldpos
     13         if realpos<0:
     14             realpos = len(exc.object) + realpos
     15         # if we don't advance this time, terminate on the next call
     16         # otherwise we'd get an endless loop
     17         if realpos <= exc.start:
     18             self.pos = len(exc.object)
     19         return (u"<?>", oldpos)
     20 
     21 # A UnicodeEncodeError object with a bad start attribute
     22 class BadStartUnicodeEncodeError(UnicodeEncodeError):
     23     def __init__(self):
     24         UnicodeEncodeError.__init__(self, "ascii", u"", 0, 1, "bad")
     25         self.start = []
     26 
     27 # A UnicodeEncodeError object with a bad object attribute
     28 class BadObjectUnicodeEncodeError(UnicodeEncodeError):
     29     def __init__(self):
     30         UnicodeEncodeError.__init__(self, "ascii", u"", 0, 1, "bad")
     31         self.object = []
     32 
     33 # A UnicodeDecodeError object without an end attribute
     34 class NoEndUnicodeDecodeError(UnicodeDecodeError):
     35     def __init__(self):
     36         UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad")
     37         del self.end
     38 
     39 # A UnicodeDecodeError object with a bad object attribute
     40 class BadObjectUnicodeDecodeError(UnicodeDecodeError):
     41     def __init__(self):
     42         UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad")
     43         self.object = []
     44 
     45 # A UnicodeTranslateError object without a start attribute
     46 class NoStartUnicodeTranslateError(UnicodeTranslateError):
     47     def __init__(self):
     48         UnicodeTranslateError.__init__(self, u"", 0, 1, "bad")
     49         del self.start
     50 
     51 # A UnicodeTranslateError object without an end attribute
     52 class NoEndUnicodeTranslateError(UnicodeTranslateError):
     53     def __init__(self):
     54         UnicodeTranslateError.__init__(self,  u"", 0, 1, "bad")
     55         del self.end
     56 
     57 # A UnicodeTranslateError object without an object attribute
     58 class NoObjectUnicodeTranslateError(UnicodeTranslateError):
     59     def __init__(self):
     60         UnicodeTranslateError.__init__(self, u"", 0, 1, "bad")
     61         del self.object
     62 
     63 class CodecCallbackTest(unittest.TestCase):
     64 
     65     def test_xmlcharrefreplace(self):
     66         # replace unencodable characters which numeric character entities.
     67         # For ascii, latin-1 and charmaps this is completely implemented
     68         # in C and should be reasonably fast.
     69         s = u"\u30b9\u30d1\u30e2 \xe4nd egg\u0161"
     70         self.assertEqual(
     71             s.encode("ascii", "xmlcharrefreplace"),
     72             "&#12473;&#12497;&#12514; &#228;nd egg&#353;"
     73         )
     74         self.assertEqual(
     75             s.encode("latin-1", "xmlcharrefreplace"),
     76             "&#12473;&#12497;&#12514; \xe4nd egg&#353;"
     77         )
     78         self.assertEqual(
     79             s.encode("iso-8859-15", "xmlcharrefreplace"),
     80             "&#12473;&#12497;&#12514; \xe4nd egg\xa8"
     81         )
     82 
     83     def test_xmlcharrefreplace_with_surrogates(self):
     84         tests = [(u'\U0001f49d', '&#128157;'),
     85                  (u'\ud83d', '&#55357;'),
     86                  (u'\udc9d', '&#56477;'),
     87                 ]
     88         if u'\ud83d\udc9d' != u'\U0001f49d':
     89             tests += [(u'\ud83d\udc9d', '&#55357;&#56477;')]
     90         for encoding in ['ascii', 'latin1', 'iso-8859-15']:
     91             for s, exp in tests:
     92                 self.assertEqual(s.encode(encoding, 'xmlcharrefreplace'),
     93                                  exp, msg='%r.encode(%r)' % (s, encoding))
     94                 self.assertEqual((s+'X').encode(encoding, 'xmlcharrefreplace'),
     95                                  exp+'X',
     96                                  msg='%r.encode(%r)' % (s + 'X', encoding))
     97 
     98     def test_xmlcharnamereplace(self):
     99         # This time use a named character entity for unencodable
    100         # characters, if one is available.
    101 
    102         def xmlcharnamereplace(exc):
    103             if not isinstance(exc, UnicodeEncodeError):
    104                 raise TypeError("don't know how to handle %r" % exc)
    105             l = []
    106             for c in exc.object[exc.start:exc.end]:
    107                 try:
    108                     l.append(u"&%s;" % htmlentitydefs.codepoint2name[ord(c)])
    109                 except KeyError:
    110                     l.append(u"&#%d;" % ord(c))
    111             return (u"".join(l), exc.end)
    112 
    113         codecs.register_error(
    114             "test.xmlcharnamereplace", xmlcharnamereplace)
    115 
    116         sin = u"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
    117         sout = "&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
    118         self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
    119         sout = "\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
    120         self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
    121         sout = "\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
    122         self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)
    123 
    def test_uninamereplace(self):
        # We're using the names from the unicode database this time,
        # and we're doing "syntax highlighting" here, i.e. we include
        # the replaced text in ANSI escape sequences. For this it is
        # useful that the error handler is not called for every single
        # unencodable character, but for a complete sequence of
        # unencodable characters, otherwise we would output many
        # unnecessary escape sequences.

        def uninamereplace(exc):
            if not isinstance(exc, UnicodeEncodeError):
                raise TypeError("don't know how to handle %r" % exc)
            # character name, or its hex code if the database has no name
            names = [
                unicodedata.name(char, u"0x%x" % ord(char))
                for char in exc.object[exc.start:exc.end]
            ]
            return (u"\033[1m%s\033[0m" % u", ".join(names), exc.end)

        codecs.register_error(
            "test.uninamereplace", uninamereplace)

        text = u"\xac\u1234\u20ac\u8000"
        # (codec name, expected encoded output)
        expectations = (
            ("ascii",
             "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"),
            ("latin-1",
             "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"),
            ("iso-8859-15",
             "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"),
        )
        for codec_name, expected in expectations:
            self.assertEqual(
                text.encode(codec_name, "test.uninamereplace"), expected)
    153 
    def test_backslashescape(self):
        # Does the same as the "unicode-escape" encoding, but with different
        # base encodings.
        text = u"a\xac\u1234\u20ac\u8000"
        tail = ""
        if sys.maxunicode > 0xffff:
            # Also exercise the largest code point; it is expected to be
            # escaped as \UXXXXXXXX by every codec below.
            text += unichr(sys.maxunicode)
            tail = "\\U%08x" % sys.maxunicode
        # (codec name, expected output for the BMP part of the input)
        expectations = (
            ("ascii", "a\\xac\\u1234\\u20ac\\u8000"),
            ("latin-1", "a\xac\\u1234\\u20ac\\u8000"),
            ("iso-8859-15", "a\xac\\u1234\xa4\\u8000"),
        )
        for codec_name, head in expectations:
            self.assertEqual(
                text.encode(codec_name, "backslashreplace"), head + tail)
    174 
    def test_decoding_callbacks(self):
        # This is a test for a decoding callback handler
        # that allows the decoding of the invalid sequence
        # "\xc0\x80" and returns "\x00" instead of raising an error.
        # All other illegal sequences will be handled strictly.
        def relaxedutf8(exc):
            if not isinstance(exc, UnicodeDecodeError):
                raise TypeError("don't know how to handle %r" % exc)
            if exc.object[exc.start:exc.start+2] != "\xc0\x80":
                # anything else stays an error
                raise exc
            # replace the two bytes and resume decoding right after them
            return (u"\x00", exc.start+2)

        codecs.register_error("test.relaxedutf8", relaxedutf8)

        # all the "\xc0\x80" will be decoded to "\x00"
        data = "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
        expected = u"a\x00b\x00c\xfc\x00\x00"
        self.assertEqual(data.decode("utf-8", "test.relaxedutf8"), expected)

        # "\xc0\x81" is not valid and a UnicodeDecodeError will be raised
        data = "\xc0\x80\xc0\x81"
        self.assertRaises(UnicodeDecodeError, data.decode,
                          "utf-8", "test.relaxedutf8")
    199 
    def test_charmapencode(self):
        # For charmap encodings the replacement string will be
        # mapped through the encoding again. This means, that
        # to be able to use e.g. the "replace" handler, the
        # charmap has to have a mapping for "?".
        charmap = dict((ord(c), 2*c.upper()) for c in "abcdefgh")
        sin = u"abc"
        sout = "AABBCC"
        self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout)

        # "A" has no entry in the charmap, so strict encoding must fail
        sin = u"abcA"
        self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)

        # with "?" mapped, each unmapped character yields that mapping
        charmap[ord("?")] = "XYZ"
        sin = u"abcDEF"
        sout = "AABBCCXYZXYZXYZ"
        self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout)

        # a unicode object as mapping result is rejected with a TypeError
        # (this check used to be repeated verbatim; once is sufficient)
        charmap[ord("?")] = u"XYZ"
        self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
    223 
    def test_decodeunicodeinternal(self):
        # Strict decoding of these five bytes is expected to fail.
        data = "\x00\x00\x00\x00\x00"
        self.assertRaises(
            UnicodeDecodeError,
            data.decode,
            "unicode-internal",
        )
        if sys.maxunicode <= 0xffff:
            # the handler checks below are only run on wide unicode builds
            return

        def handler_unicodeinternal(exc):
            if not isinstance(exc, UnicodeDecodeError):
                raise TypeError("don't know how to handle %r" % exc)
            # substitute u"\x01" and resume decoding at offset 1
            return (u"\x01", 1)

        self.assertEqual(
            data.decode("unicode-internal", "ignore"),
            u"\u0000"
        )

        self.assertEqual(
            data.decode("unicode-internal", "replace"),
            u"\u0000\ufffd"
        )

        codecs.register_error("test.hui", handler_unicodeinternal)

        self.assertEqual(
            data.decode("unicode-internal", "test.hui"),
            u"\u0000\u0001\u0000"
        )
    252 
    253     def test_callbacks(self):
    254         def handler1(exc):
    255             if not isinstance(exc, UnicodeEncodeError) \
    256                and not isinstance(exc, UnicodeDecodeError):
    257                 raise TypeError("don't know how to handle %r" % exc)
    258             l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
    259             return (u"[%s]" % u"".join(l), exc.end)
    260 
    261         codecs.register_error("test.handler1", handler1)
    262 
    263         def handler2(exc):
    264             if not isinstance(exc, UnicodeDecodeError):
    265                 raise TypeError("don't know how to handle %r" % exc)
    266             l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
    267             return (u"[%s]" % u"".join(l), exc.end+1) # skip one character
    268 
    269         codecs.register_error("test.handler2", handler2)
    270 
    271         s = "\x00\x81\x7f\x80\xff"
    272 
    273         self.assertEqual(
    274             s.decode("ascii", "test.handler1"),
    275             u"\x00[<129>]\x7f[<128>][<255>]"
    276         )
    277         self.assertEqual(
    278             s.decode("ascii", "test.handler2"),
    279             u"\x00[<129>][<128>]"
    280         )
    281 
    282         self.assertEqual(
    283             "\\u3042\u3xxx".decode("unicode-escape", "test.handler1"),
    284             u"\u3042[<92><117><51>]xxx"
    285         )
    286 
    287         self.assertEqual(
    288             "\\u3042\u3xx".decode("unicode-escape", "test.handler1"),
    289             u"\u3042[<92><117><51>]xx"
    290         )
    291 
    292         self.assertEqual(
    293             codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0],
    294             u"z[<98>][<99>]"
    295         )
    296 
    297         self.assertEqual(
    298             u"g\xfc\xdfrk".encode("ascii", "test.handler1"),
    299             u"g[<252><223>]rk"
    300         )
    301 
    302         self.assertEqual(
    303             u"g\xfc\xdf".encode("ascii", "test.handler1"),
    304             u"g[<252><223>]"
    305         )
    306 
    307     def test_longstrings(self):
    308         # test long strings to check for memory overflow problems
    309         errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
    310                    "backslashreplace"]
    311         # register the handlers under different names,
    312         # to prevent the codec from recognizing the name
    313         for err in errors:
    314             codecs.register_error("test." + err, codecs.lookup_error(err))
    315         l = 1000
    316         errors += [ "test." + err for err in errors ]
    317         for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
    318             for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
    319                         "utf-8", "utf-7", "utf-16", "utf-32"):
    320                 for err in errors:
    321                     try:
    322                         uni.encode(enc, err)
    323                     except UnicodeError:
    324                         pass
    325 
    def check_exceptionobjectargs(self, exctype, args, msg):
        """Check construction and str() conversion of an exception type.

        exctype -- the exception class (or any callable) under test
        args    -- list with a valid set of constructor arguments
        msg     -- expected result of str() on the exception built from args

        The constructor must raise TypeError when one argument is missing,
        when there is one argument too many, and when any single argument
        is replaced by a value of a different type.
        """
        # check with one missing argument
        self.assertRaises(TypeError, exctype, *args[:-1])
        # check with one argument too much
        self.assertRaises(TypeError, exctype, *(args + ["too much"]))
        # check with one argument of the wrong type
        wrongargs = [ "spam", u"eggs", 42, 1.0, None ]
        for i, goodarg in enumerate(args):
            for wrongarg in wrongargs:
                if type(wrongarg) is type(goodarg):
                    continue
                # Keep every other argument valid, so that the TypeError can
                # only be caused by the wrong value at position i.
                callargs = list(args)
                callargs[i] = wrongarg
                self.assertRaises(TypeError, exctype, *callargs)

        # check with the correct number and type of arguments
        exc = exctype(*args)
        self.assertEqual(str(exc), msg)
    350 
    def test_unicodeencodeerror(self):
        # (constructor arguments, expected str() of the exception)
        cases = [
            (["ascii", u"g\xfcrk", 1, 2, "ouch"],
             "'ascii' codec can't encode character u'\\xfc' in position 1: ouch"),
            (["ascii", u"g\xfcrk", 1, 4, "ouch"],
             "'ascii' codec can't encode characters in position 1-3: ouch"),
            (["ascii", u"\xfcx", 0, 1, "ouch"],
             "'ascii' codec can't encode character u'\\xfc' in position 0: ouch"),
            (["ascii", u"\u0100x", 0, 1, "ouch"],
             "'ascii' codec can't encode character u'\\u0100' in position 0: ouch"),
            (["ascii", u"\uffffx", 0, 1, "ouch"],
             "'ascii' codec can't encode character u'\\uffff' in position 0: ouch"),
        ]
        if sys.maxunicode > 0xffff:
            # a non-BMP character is only checked on wide unicode builds
            cases.append(
                (["ascii", u"\U00010000x", 0, 1, "ouch"],
                 "'ascii' codec can't encode character u'\\U00010000' in position 0: ouch")
            )
        for ctor_args, message in cases:
            self.check_exceptionobjectargs(
                UnicodeEncodeError, ctor_args, message)
    383 
    def test_unicodedecodeerror(self):
        # (constructor arguments, expected str() of the exception):
        # a single offending byte first, then a range of bytes
        cases = (
            (["ascii", "g\xfcrk", 1, 2, "ouch"],
             "'ascii' codec can't decode byte 0xfc in position 1: ouch"),
            (["ascii", "g\xfcrk", 1, 3, "ouch"],
             "'ascii' codec can't decode bytes in position 1-2: ouch"),
        )
        for ctor_args, message in cases:
            self.check_exceptionobjectargs(
                UnicodeDecodeError, ctor_args, message)
    395 
    def test_unicodetranslateerror(self):
        # (constructor arguments, expected str() of the exception)
        cases = [
            ([u"g\xfcrk", 1, 2, "ouch"],
             "can't translate character u'\\xfc' in position 1: ouch"),
            ([u"g\u0100rk", 1, 2, "ouch"],
             "can't translate character u'\\u0100' in position 1: ouch"),
            ([u"g\uffffrk", 1, 2, "ouch"],
             "can't translate character u'\\uffff' in position 1: ouch"),
        ]
        if sys.maxunicode > 0xffff:
            # a non-BMP character is only checked on wide unicode builds
            cases.append(
                ([u"g\U00010000rk", 1, 2, "ouch"],
                 "can't translate character u'\\U00010000' in position 1: ouch")
            )
        # a range of characters instead of a single one
        cases.append(
            ([u"g\xfcrk", 1, 3, "ouch"],
             "can't translate characters in position 1-2: ouch")
        )
        for ctor_args, message in cases:
            self.check_exceptionobjectargs(
                UnicodeTranslateError, ctor_args, message)
    423 
    def test_badandgoodstrictexceptions(self):
        strict = codecs.strict_errors
        # "strict" complains about a non-exception passed in
        self.assertRaises(TypeError, strict, 42)
        # "strict" complains about the wrong exception type
        self.assertRaises(Exception, strict, Exception("ouch"))

        # If the correct exception is passed in, "strict" raises it
        unicode_errors = (
            UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch"),
            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch"),
            UnicodeTranslateError(u"\u3042", 0, 1, "ouch"),
        )
        for exc in unicode_errors:
            self.assertRaises(exc.__class__, strict, exc)
    454 
    def test_badandgoodignoreexceptions(self):
        ignore = codecs.ignore_errors
        # "ignore" complains about a non-exception passed in
        self.assertRaises(TypeError, ignore, 42)
        # "ignore" complains about the wrong exception type
        self.assertRaises(TypeError, ignore, UnicodeError("ouch"))
        # If the correct exception is passed in, "ignore" returns an empty
        # replacement together with the end position of the error
        unicode_errors = (
            UnicodeEncodeError("ascii", u"a\u3042b", 1, 2, "ouch"),
            UnicodeDecodeError("ascii", "a\xffb", 1, 2, "ouch"),
            UnicodeTranslateError(u"a\u3042b", 1, 2, "ouch"),
        )
        for exc in unicode_errors:
            self.assertEqual(ignore(exc), (u"", 2))
    484 
    def test_badandgoodreplaceexceptions(self):
        replace = codecs.replace_errors
        # "replace" complains about a non-exception passed in
        self.assertRaises(TypeError, replace, 42)
        # "replace" complains about the wrong exception type
        self.assertRaises(TypeError, replace, UnicodeError("ouch"))
        # ... and about exceptions whose object attribute is a list
        self.assertRaises(TypeError, replace, BadObjectUnicodeEncodeError())
        self.assertRaises(TypeError, replace, BadObjectUnicodeDecodeError())
        # With the correct exception, "replace" returns an "?" or u"\ufffd" replacement
        expectations = (
            (UnicodeEncodeError("ascii", u"a\u3042b", 1, 2, "ouch"),
             (u"?", 2)),
            (UnicodeDecodeError("ascii", "a\xffb", 1, 2, "ouch"),
             (u"\ufffd", 2)),
            (UnicodeTranslateError(u"a\u3042b", 1, 2, "ouch"),
             (u"\ufffd", 2)),
        )
        for exc, expected in expectations:
            self.assertEqual(replace(exc), expected)
    524 
    def test_badandgoodxmlcharrefreplaceexceptions(self):
        handler = codecs.xmlcharrefreplace_errors
        # "xmlcharrefreplace" complains about a non-exception passed in
        self.assertRaises(TypeError, handler, 42)
        # "xmlcharrefreplace" complains about the wrong exception types
        self.assertRaises(TypeError, handler, UnicodeError("ouch"))
        # "xmlcharrefreplace" can only be used for encoding
        non_encoding_errors = (
            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch"),
            UnicodeTranslateError(u"\u3042", 0, 1, "ouch"),
        )
        for exc in non_encoding_errors:
            self.assertRaises(TypeError, handler, exc)
        # Use the correct exception
        codepoints = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000,
                      0xdfff, 0xd800)
        text = u"".join(unichr(c) for c in codepoints)
        # The non-BMP characters are written as a literal; their code
        # points are the four values appended to the tuple below.
        text += u"\U0001869f\U000186a0\U000f423f\U000f4240"
        codepoints += (99999, 100000, 999999, 1000000)
        end = 1 + len(text)
        expected = u"".join(u"&#%d;" % c for c in codepoints)
        self.assertEqual(
            handler(
                UnicodeEncodeError("ascii", u"a" + text + u"b",
                                   1, end, "ouch")
            ),
            (expected, end)
        )
    562 
    def test_badandgoodbackslashreplaceexceptions(self):
        handler = codecs.backslashreplace_errors
        # "backslashreplace" complains about a non-exception passed in
        self.assertRaises(TypeError, handler, 42)
        # "backslashreplace" complains about the wrong exception types
        self.assertRaises(TypeError, handler, UnicodeError("ouch"))
        # "backslashreplace" can only be used for encoding
        non_encoding_errors = (
            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch"),
            UnicodeTranslateError(u"\u3042", 0, 1, "ouch"),
        )
        for exc in non_encoding_errors:
            self.assertRaises(TypeError, handler, exc)
        # Use the correct exception
        cases = [
            (u"\u3042", u"\\u3042"),
            (u"\n", u"\\x0a"),
            (u"a", u"\\x61"),
            (u"\x00", u"\\x00"),
            (u"\xff", u"\\xff"),
            (u"\u0100", u"\\u0100"),
            (u"\uffff", u"\\uffff"),
            # Lone surrogates
            (u"\ud800", u"\\ud800"),
            (u"\udfff", u"\\udfff"),
        ]
        # The expected escapes for non-BMP characters depend on the build:
        # a single \U escape on wide builds, two \u escapes otherwise.
        if sys.maxunicode > 0xffff:
            astral_cases = [
                (u"\U00010000", u"\\U00010000"),
                (u"\U0010ffff", u"\\U0010ffff"),
            ]
        else:
            astral_cases = [
                (u"\U00010000", u"\\ud800\\udc00"),
                (u"\U0010ffff", u"\\udbff\\udfff"),
            ]
        for text, escaped in cases + astral_cases:
            end = 1 + len(text)
            self.assertEqual(
                handler(
                    UnicodeEncodeError("ascii", u"a" + text + u"b",
                                       1, end, "ouch")),
                (escaped, end)
            )
    617 
    def test_badhandlerresults(self):
        # Every one of these handler results has to be rejected with a
        # TypeError, both when encoding and when decoding.
        bad_results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
        encodings = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
        # (codec name, input used to trigger the decoding error handler)
        undecodable = (
            ("ascii", "\xff"),
            ("utf-8", "\xff"),
            ("utf-7", "+x-"),
            ("unicode-internal", "\x00"),
        )

        for result in bad_results:
            # re-register the handler so that it returns the current result
            codecs.register_error("test.badhandler", lambda exc: result)
            for enc in encodings:
                self.assertRaises(
                    TypeError,
                    u"\u3042".encode,
                    enc,
                    "test.badhandler"
                )
            for enc, data in undecodable:
                self.assertRaises(
                    TypeError,
                    data.decode,
                    enc,
                    "test.badhandler"
                )
    643 
    644     def test_lookup(self):
    645         self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
    646         self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore"))
    647         self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
    648         self.assertEqual(
    649             codecs.xmlcharrefreplace_errors,
    650             codecs.lookup_error("xmlcharrefreplace")
    651         )
    652         self.assertEqual(
    653             codecs.backslashreplace_errors,
    654             codecs.lookup_error("backslashreplace")
    655         )
    656 
    657     def test_unencodablereplacement(self):
    658         def unencrepl(exc):
    659             if isinstance(exc, UnicodeEncodeError):
    660                 return (u"\u4242", exc.end)
    661             else:
    662                 raise TypeError("don't know how to handle %r" % exc)
    663         codecs.register_error("test.unencreplhandler", unencrepl)
    664         for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
    665             self.assertRaises(
    666                 UnicodeEncodeError,
    667                 u"\u4242".encode,
    668                 enc,
    669                 "test.unencreplhandler"
    670             )
    671 
    672     def test_badregistercall(self):
    673         # enhance coverage of:
    674         # Modules/_codecsmodule.c::register_error()
    675         # Python/codecs.c::PyCodec_RegisterError()
    676         self.assertRaises(TypeError, codecs.register_error, 42)
    677         self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)
    678 
    679     def test_badlookupcall(self):
    680         # enhance coverage of:
    681         # Modules/_codecsmodule.c::lookup_error()
    682         self.assertRaises(TypeError, codecs.lookup_error)
    683 
    684     def test_unknownhandler(self):
    685         # enhance coverage of:
    686         # Modules/_codecsmodule.c::lookup_error()
    687         self.assertRaises(LookupError, codecs.lookup_error, "test.unknown")
    688 
    def test_xmlcharrefvalues(self):
        # enhance coverage of:
        # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
        # and inline implementations
        codepoints = [1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000]
        if sys.maxunicode>=100000:
            # larger code points are only available on wide unicode builds
            codepoints.extend([100000, 500000, 1000000])
        text = u"".join(map(unichr, codepoints))
        codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors)
        handler_names = ("xmlcharrefreplace", "test.xmlcharrefreplace")
        for enc in ("ascii", "iso-8859-15"):
            for handler_name in handler_names:
                # the result is not inspected; encoding only has to succeed
                text.encode(enc, handler_name)
    701 
    def test_decodehelper(self):
        # Coverage for:
        # Objects/unicodeobject.c::unicode_decode_call_errorhandler()
        # and callers

        # Decoding with the handler name "test.unknown" must fail the lookup.
        undecodable = "\xff"
        self.assertRaises(LookupError, undecodable.decode, "ascii", "test.unknown")

        # A handler whose return value is not a tuple: TypeError is expected
        # for every (input, codec) pair listed below.
        def baddecodereturn1(exc):
            return 42
        codecs.register_error("test.baddecodereturn1", baddecodereturn1)
        broken_inputs = (
            ("\xff", "ascii"),
            ("\\", "unicode-escape"),
            ("\\x0", "unicode-escape"),
            ("\\x0y", "unicode-escape"),
            ("\\Uffffeeee", "unicode-escape"),
            ("\\uyyyy", "raw-unicode-escape"),
        )
        for raw, enc in broken_inputs:
            self.assertRaises(TypeError, raw.decode, enc, "test.baddecodereturn1")

        # A handler returning a tuple whose position is None: TypeError too.
        def baddecodereturn2(exc):
            return (u"?", None)
        codecs.register_error("test.baddecodereturn2", baddecodereturn2)
        self.assertRaises(TypeError, undecodable.decode, "ascii", "test.baddecodereturn2")

        # PosReturn lets each case pick the position that the handler
        # reports back to the decoder.
        handler = PosReturn()
        codecs.register_error("test.posreturn", handler.handle)

        data = "\xff0"
        position_cases = (
            (-1, u"<?>0"),      # valid negative position
            (-2, u"<?><?>"),    # valid negative position
            (-3, IndexError),   # negative position out of bounds
            (1, u"<?>0"),       # valid positive position
            (2, u"<?>"),        # largest valid positive position
                                # (one beyond end of input)
            (3, IndexError),    # invalid positive position
        )
        for pos, outcome in position_cases:
            handler.pos = pos
            if outcome is IndexError:
                self.assertRaises(IndexError, data.decode, "ascii", "test.posreturn")
            else:
                self.assertEqual(data.decode("ascii", "test.posreturn"), outcome)

        # Restart at the "0"
        handler.pos = 6
        self.assertEqual(
            "\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"),
            u"<?>0"
        )

        # charmap_decode() with mappings that yield None, raise from
        # __getitem__, or map to 0x110000.
        class RaisingDict(dict):
            def __getitem__(self, key):
                raise ValueError
        mapping_cases = (
            (UnicodeError, {0xff: None}),
            (ValueError, RaisingDict()),
            (TypeError, {0xff: 0x110000}),
        )
        for exc_type, mapping in mapping_cases:
            self.assertRaises(exc_type, codecs.charmap_decode, "\xff", "strict", mapping)
    760 
    def test_encodehelper(self):
        # Coverage for:
        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
        # and callers

        # Encoding with the handler name "test.unknown" must fail the lookup.
        unencodable = u"\xff"
        self.assertRaises(LookupError, unencodable.encode, "ascii", "test.unknown")

        # A handler whose return value is not a tuple: TypeError is expected.
        def badencodereturn1(exc):
            return 42
        codecs.register_error("test.badencodereturn1", badencodereturn1)
        self.assertRaises(TypeError, unencodable.encode, "ascii", "test.badencodereturn1")

        # A handler returning a tuple whose position is None: TypeError too.
        def badencodereturn2(exc):
            return (u"?", None)
        codecs.register_error("test.badencodereturn2", badencodereturn2)
        self.assertRaises(TypeError, unencodable.encode, "ascii", "test.badencodereturn2")

        # PosReturn lets each case pick the position that the handler
        # reports back to the encoder.
        handler = PosReturn()
        codecs.register_error("test.posreturn", handler.handle)

        text = u"\xff0"
        position_cases = (
            (-1, "<?>0"),       # valid negative position
            (-2, "<?><?>"),     # valid negative position
            (-3, IndexError),   # negative position out of bounds
            (1, "<?>0"),        # valid positive position
            (2, "<?>"),         # largest valid positive position
                                # (one beyond end of input)
            (3, IndexError),    # invalid positive position
        )
        for pos, outcome in position_cases:
            handler.pos = pos
            if outcome is IndexError:
                self.assertRaises(IndexError, text.encode, "ascii", "test.posreturn")
            else:
                self.assertEqual(text.encode("ascii", "test.posreturn"), outcome)

        # Reset the position once; the loop below does not touch it again,
        # so the handler keeps whatever state it reaches between calls.
        handler.pos = 0

        # charmap_encode() with mappings that yield None, raise from
        # __getitem__, or map to 300, tried under several error handlers.
        class RaisingDict(dict):
            def __getitem__(self, key):
                raise ValueError
        for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"):
            mapping_cases = (
                (UnicodeError, {0xff: None}),
                (ValueError, RaisingDict()),
                (TypeError, {0xff: 300}),
            )
            for exc_type, mapping in mapping_cases:
                self.assertRaises(exc_type, codecs.charmap_encode, u"\xff", err, mapping)
    813 
    def test_translatehelper(self):
        # Coverage for:
        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
        # and callers
        # (The errors argument is not directly accessible from Python,
        # which limits how much can be tested here.)
        class RaisingDict(dict):
            def __getitem__(self, key):
                raise ValueError

        text = u"\xff"
        table_cases = (
            (ValueError, RaisingDict()),                # lookup itself raises
            (TypeError, {0xff: sys.maxunicode + 1}),    # value above sys.maxunicode
            (TypeError, {0xff: ()}),                    # value is an empty tuple
        )
        for exc_type, table in table_cases:
            self.assertRaises(exc_type, text.translate, table)
    826 
    827     def test_bug828737(self):
    828         charmap = {
    829             ord("&"): u"&amp;",
    830             ord("<"): u"&lt;",
    831             ord(">"): u"&gt;",
    832             ord('"'): u"&quot;",
    833         }
    834 
    835         for n in (1, 10, 100, 1000):
    836             text = u'abc<def>ghi'*n
    837             text.translate(charmap)
    838 
    def test_fake_error_class(self):
        # Objects that merely claim, through a __class__ attribute, to be
        # one of the Unicode error types are passed to every stock handler.
        stock_handlers = (
            codecs.strict_errors,
            codecs.ignore_errors,
            codecs.replace_errors,
            codecs.backslashreplace_errors,
            codecs.xmlcharrefreplace_errors,
        )
        for claimed in (UnicodeEncodeError, UnicodeDecodeError, UnicodeTranslateError):
            # A str subclass posing as the error type: TypeError expected.
            class StrImpostor(str):
                __class__ = claimed
            for handler in stock_handlers:
                bogus = StrImpostor()
                with self.assertRaises(TypeError):
                    handler(bogus)

            # An Exception subclass posing as the error type: either
            # TypeError or the impostor class itself is accepted.
            class ExcImpostor(Exception):
                __class__ = claimed
            accepted = (TypeError, ExcImpostor)
            for handler in stock_handlers:
                with self.assertRaises(accepted):
                    handler(ExcImpostor())
    857 
    858 
    859 def test_main():
    860     test.test_support.run_unittest(CodecCallbackTest)
    861 
    862 if __name__ == "__main__":
    863     test_main()
    864