1 import codecs 2 import contextlib 3 import io 4 import locale 5 import sys 6 import unittest 7 import encodings 8 from unittest import mock 9 10 from test import support 11 12 try: 13 import ctypes 14 except ImportError: 15 ctypes = None 16 SIZEOF_WCHAR_T = -1 17 else: 18 SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar) 19 20 def coding_checker(self, coder): 21 def check(input, expect): 22 self.assertEqual(coder(input), (expect, len(input))) 23 return check 24 25 26 class Queue(object): 27 """ 28 queue: write bytes at one end, read bytes from the other end 29 """ 30 def __init__(self, buffer): 31 self._buffer = buffer 32 33 def write(self, chars): 34 self._buffer += chars 35 36 def read(self, size=-1): 37 if size<0: 38 s = self._buffer 39 self._buffer = self._buffer[:0] # make empty 40 return s 41 else: 42 s = self._buffer[:size] 43 self._buffer = self._buffer[size:] 44 return s 45 46 47 class MixInCheckStateHandling: 48 def check_state_handling_decode(self, encoding, u, s): 49 for i in range(len(s)+1): 50 d = codecs.getincrementaldecoder(encoding)() 51 part1 = d.decode(s[:i]) 52 state = d.getstate() 53 self.assertIsInstance(state[1], int) 54 # Check that the condition stated in the documentation for 55 # IncrementalDecoder.getstate() holds 56 if not state[1]: 57 # reset decoder to the default state without anything buffered 58 d.setstate((state[0][:0], 0)) 59 # Feeding the previous input may not produce any output 60 self.assertTrue(not d.decode(state[0])) 61 # The decoder must return to the same state 62 self.assertEqual(state, d.getstate()) 63 # Create a new decoder and set it to the state 64 # we extracted from the old one 65 d = codecs.getincrementaldecoder(encoding)() 66 d.setstate(state) 67 part2 = d.decode(s[i:], True) 68 self.assertEqual(u, part1+part2) 69 70 def check_state_handling_encode(self, encoding, u, s): 71 for i in range(len(u)+1): 72 d = codecs.getincrementalencoder(encoding)() 73 part1 = d.encode(u[:i]) 74 state = d.getstate() 75 d = codecs.getincrementalencoder(encoding)() 76 d.setstate(state) 77 part2 = d.encode(u[i:], True) 78 self.assertEqual(s, part1+part2) 79 80 81 class ReadTest(MixInCheckStateHandling): 82 def check_partial(self, input, partialresults): 83 # get a StreamReader for the encoding and feed the bytestring version 84 # of input to the reader byte by byte. Read everything available from 85 # the StreamReader and check that the results equal the appropriate 86 # entries from partialresults. 87 q = Queue(b"") 88 r = codecs.getreader(self.encoding)(q) 89 result = "" 90 for (c, partialresult) in zip(input.encode(self.encoding), partialresults): 91 q.write(bytes([c])) 92 result += r.read() 93 self.assertEqual(result, partialresult) 94 # check that there's nothing left in the buffers 95 self.assertEqual(r.read(), "") 96 self.assertEqual(r.bytebuffer, b"") 97 98 # do the check again, this time using an incremental decoder 99 d = codecs.getincrementaldecoder(self.encoding)() 100 result = "" 101 for (c, partialresult) in zip(input.encode(self.encoding), partialresults): 102 result += d.decode(bytes([c])) 103 self.assertEqual(result, partialresult) 104 # check that there's nothing left in the buffers 105 self.assertEqual(d.decode(b"", True), "") 106 self.assertEqual(d.buffer, b"") 107 108 # Check whether the reset method works properly 109 d.reset() 110 result = "" 111 for (c, partialresult) in zip(input.encode(self.encoding), partialresults): 112 result += d.decode(bytes([c])) 113 self.assertEqual(result, partialresult) 114 # check that there's nothing left in the buffers 115 self.assertEqual(d.decode(b"", True), "") 116 self.assertEqual(d.buffer, b"") 117 118 # check iterdecode() 119 encoded = input.encode(self.encoding) 120 self.assertEqual( 121 input, 122 "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding)) 123 ) 124 125 def test_readline(self): 126 def getreader(input): 127 stream = io.BytesIO(input.encode(self.encoding)) 128 return codecs.getreader(self.encoding)(stream) 129 130 def readalllines(input, keepends=True, size=None): 131 reader = getreader(input) 132 lines = [] 133 while True: 134 line = reader.readline(size=size, keepends=keepends) 135 if not line: 136 break 137 lines.append(line) 138 return "|".join(lines) 139 140 s = "foo\nbar\r\nbaz\rspam\u2028eggs" 141 sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs" 142 sexpectednoends = "foo|bar|baz|spam|eggs" 143 self.assertEqual(readalllines(s, True), sexpected) 144 self.assertEqual(readalllines(s, False), sexpectednoends) 145 self.assertEqual(readalllines(s, True, 10), sexpected) 146 self.assertEqual(readalllines(s, False, 10), sexpectednoends) 147 148 lineends = ("\n", "\r\n", "\r", "\u2028") 149 # Test long lines (multiple calls to read() in readline()) 150 vw = [] 151 vwo = [] 152 for (i, lineend) in enumerate(lineends): 153 vw.append((i*200+200)*"\u3042" + lineend) 154 vwo.append((i*200+200)*"\u3042") 155 self.assertEqual(readalllines("".join(vw), True), "|".join(vw)) 156 self.assertEqual(readalllines("".join(vw), False), "|".join(vwo)) 157 158 # Test lines where the first read might end with \r, so the 159 # reader has to look ahead whether this is a lone \r or a \r\n 160 for size in range(80): 161 for lineend in lineends: 162 s = 10*(size*"a" + lineend + "xxx\n") 163 reader = getreader(s) 164 for i in range(10): 165 self.assertEqual( 166 reader.readline(keepends=True), 167 size*"a" + lineend, 168 ) 169 self.assertEqual( 170 reader.readline(keepends=True), 171 "xxx\n", 172 ) 173 reader = getreader(s) 174 for i in range(10): 175 self.assertEqual( 176 reader.readline(keepends=False), 177 size*"a", 178 ) 179 self.assertEqual( 180 reader.readline(keepends=False), 181 "xxx", 182 ) 183 184 def test_mixed_readline_and_read(self): 185 lines = ["Humpty Dumpty sat on a wall,\n", 186 "Humpty Dumpty had a great fall.\r\n", 187 "All the king's horses and all the king's men\r", 188 "Couldn't put Humpty together again."] 189 data = ''.join(lines) 190 def getreader(): 191 stream = io.BytesIO(data.encode(self.encoding)) 192 return codecs.getreader(self.encoding)(stream) 193 194 # Issue #8260: Test readline() followed by read() 195 f = getreader() 196 self.assertEqual(f.readline(), lines[0]) 197 self.assertEqual(f.read(), ''.join(lines[1:])) 198 self.assertEqual(f.read(), '') 199 200 # Issue #32110: Test readline() followed by read(n) 201 f = getreader() 202 self.assertEqual(f.readline(), lines[0]) 203 self.assertEqual(f.read(1), lines[1][0]) 204 self.assertEqual(f.read(0), '') 205 self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100]) 206 207 # Issue #16636: Test readline() followed by readlines() 208 f = getreader() 209 self.assertEqual(f.readline(), lines[0]) 210 self.assertEqual(f.readlines(), lines[1:]) 211 self.assertEqual(f.read(), '') 212 213 # Test read(n) followed by read() 214 f = getreader() 215 self.assertEqual(f.read(size=40, chars=5), data[:5]) 216 self.assertEqual(f.read(), data[5:]) 217 self.assertEqual(f.read(), '') 218 219 # Issue #32110: Test read(n) followed by read(n) 220 f = getreader() 221 self.assertEqual(f.read(size=40, chars=5), data[:5]) 222 self.assertEqual(f.read(1), data[5]) 223 self.assertEqual(f.read(0), '') 224 self.assertEqual(f.read(100), data[6:106]) 225 226 # Issue #12446: Test read(n) followed by readlines() 227 f = getreader() 228 self.assertEqual(f.read(size=40, chars=5), data[:5]) 229 self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:]) 230 self.assertEqual(f.read(), '') 231 232 def test_bug1175396(self): 233 s = [ 234 '<%!--===================================================\r\n', 235 ' BLOG index page: show recent articles,\r\n', 236 ' today\'s articles, or articles of a specific date.\r\n', 237 '========================================================--%>\r\n', 238 '<%@inputencoding="ISO-8859-1"%>\r\n', 239 '<%@pagetemplate=TEMPLATE.y%>\r\n', 240 '<%@import=import frog.util, frog%>\r\n', 241 '<%@import=import frog.objects%>\r\n', 242 '<%@import=from frog.storageerrors import StorageError%>\r\n', 243 '<%\r\n', 244 '\r\n', 245 'import logging\r\n', 246 'log=logging.getLogger("Snakelets.logger")\r\n', 247 '\r\n', 248 '\r\n', 249 'user=self.SessionCtx.user\r\n', 250 'storageEngine=self.SessionCtx.storageEngine\r\n', 251 '\r\n', 252 '\r\n', 253 'def readArticlesFromDate(date, count=None):\r\n', 254 ' entryids=storageEngine.listBlogEntries(date)\r\n', 255 ' entryids.reverse() # descending\r\n', 256 ' if count:\r\n', 257 ' entryids=entryids[:count]\r\n', 258 ' try:\r\n', 259 ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n', 260 ' except StorageError,x:\r\n', 261 ' log.error("Error loading articles: "+str(x))\r\n', 262 ' self.abort("cannot load articles")\r\n', 263 '\r\n', 264 'showdate=None\r\n', 265 '\r\n', 266 'arg=self.Request.getArg()\r\n', 267 'if arg=="today":\r\n', 268 ' #-------------------- TODAY\'S ARTICLES\r\n', 269 ' self.write("<h2>Today\'s articles</h2>")\r\n', 270 ' showdate = frog.util.isodatestr() \r\n', 271 ' entries = readArticlesFromDate(showdate)\r\n', 272 'elif arg=="active":\r\n', 273 ' #-------------------- ACTIVE ARTICLES redirect\r\n', 274 ' self.Yredirect("active.y")\r\n', 275 'elif arg=="login":\r\n', 276 ' #-------------------- LOGIN PAGE redirect\r\n', 277 ' self.Yredirect("login.y")\r\n', 278 'elif arg=="date":\r\n', 279 ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n', 280 ' showdate = self.Request.getParameter("date")\r\n', 281 ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n', 282 ' entries = readArticlesFromDate(showdate)\r\n', 283 'else:\r\n', 284 ' #-------------------- RECENT ARTICLES\r\n', 285 ' self.write("<h2>Recent articles</h2>")\r\n', 286 ' dates=storageEngine.listBlogEntryDates()\r\n', 287 ' if dates:\r\n', 288 ' entries=[]\r\n', 289 ' SHOWAMOUNT=10\r\n', 290 ' for showdate in dates:\r\n', 291 ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n', 292 ' if len(entries)>=SHOWAMOUNT:\r\n', 293 ' break\r\n', 294 ' \r\n', 295 ] 296 stream = io.BytesIO("".join(s).encode(self.encoding)) 297 reader = codecs.getreader(self.encoding)(stream) 298 for (i, line) in enumerate(reader): 299 self.assertEqual(line, s[i]) 300 301 def test_readlinequeue(self): 302 q = Queue(b"") 303 writer = codecs.getwriter(self.encoding)(q) 304 reader = codecs.getreader(self.encoding)(q) 305 306 # No lineends 307 writer.write("foo\r") 308 self.assertEqual(reader.readline(keepends=False), "foo") 309 writer.write("\nbar\r") 310 self.assertEqual(reader.readline(keepends=False), "") 311 self.assertEqual(reader.readline(keepends=False), "bar") 312 writer.write("baz") 313 self.assertEqual(reader.readline(keepends=False), "baz") 314 self.assertEqual(reader.readline(keepends=False), "") 315 316 # Lineends 317 writer.write("foo\r") 318 self.assertEqual(reader.readline(keepends=True), "foo\r") 319 writer.write("\nbar\r") 320 self.assertEqual(reader.readline(keepends=True), "\n") 321 self.assertEqual(reader.readline(keepends=True), "bar\r") 322 writer.write("baz") 323 self.assertEqual(reader.readline(keepends=True), "baz") 324 self.assertEqual(reader.readline(keepends=True), "") 325 writer.write("foo\r\n") 326 self.assertEqual(reader.readline(keepends=True), "foo\r\n") 327 328 def test_bug1098990_a(self): 329 s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n" 330 s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n" 331 s3 = "next line.\r\n" 332 333 s = (s1+s2+s3).encode(self.encoding) 334 stream = io.BytesIO(s) 335 reader = codecs.getreader(self.encoding)(stream) 336 self.assertEqual(reader.readline(), s1) 337 self.assertEqual(reader.readline(), s2) 338 self.assertEqual(reader.readline(), s3) 339 self.assertEqual(reader.readline(), "") 340 341 def test_bug1098990_b(self): 342 s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n" 343 s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n" 344 s3 = "stillokay:bbbbxx\r\n" 345 s4 = "broken!!!!badbad\r\n" 346 s5 = "againokay.\r\n" 347 348 s = (s1+s2+s3+s4+s5).encode(self.encoding) 349 stream = io.BytesIO(s) 350 reader = codecs.getreader(self.encoding)(stream) 351 self.assertEqual(reader.readline(), s1) 352 self.assertEqual(reader.readline(), s2) 353 self.assertEqual(reader.readline(), s3) 354 self.assertEqual(reader.readline(), s4) 355 self.assertEqual(reader.readline(), s5) 356 self.assertEqual(reader.readline(), "") 357 358 ill_formed_sequence_replace = "\ufffd" 359 360 def test_lone_surrogates(self): 361 self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding) 362 self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"), 363 "[\\udc80]".encode(self.encoding)) 364 self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"), 365 "[\\udc80]".encode(self.encoding)) 366 self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"), 367 "[�]".encode(self.encoding)) 368 self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"), 369 "[]".encode(self.encoding)) 370 self.assertEqual("[\uDC80]".encode(self.encoding, "replace"), 371 "[?]".encode(self.encoding)) 372 373 # sequential surrogate characters 374 self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"), 375 "[]".encode(self.encoding)) 376 self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"), 377 "[??]".encode(self.encoding)) 378 379 bom = "".encode(self.encoding) 380 for before, after in [("\U00010fff", "A"), ("[", "]"), 381 ("A", "\U00010fff")]: 382 before_sequence = before.encode(self.encoding)[len(bom):] 383 after_sequence = after.encode(self.encoding)[len(bom):] 384 test_string = before + "\uDC80" + after 385 test_sequence = (bom + before_sequence + 386 self.ill_formed_sequence + after_sequence) 387 self.assertRaises(UnicodeDecodeError, test_sequence.decode, 388 self.encoding) 389 self.assertEqual(test_string.encode(self.encoding, 390 "surrogatepass"), 391 test_sequence) 392 self.assertEqual(test_sequence.decode(self.encoding, 393 "surrogatepass"), 394 test_string) 395 self.assertEqual(test_sequence.decode(self.encoding, "ignore"), 396 before + after) 397 self.assertEqual(test_sequence.decode(self.encoding, "replace"), 398 before + self.ill_formed_sequence_replace + after) 399 backslashreplace = ''.join('\\x%02x' % b 400 for b in self.ill_formed_sequence) 401 self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"), 402 before + backslashreplace + after) 403 404 405 class UTF32Test(ReadTest, unittest.TestCase): 406 encoding = "utf-32" 407 if sys.byteorder == 'little': 408 ill_formed_sequence = b"\x80\xdc\x00\x00" 409 else: 410 ill_formed_sequence = b"\x00\x00\xdc\x80" 411 412 spamle = (b'\xff\xfe\x00\x00' 413 b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00' 414 b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00') 415 spambe = (b'\x00\x00\xfe\xff' 416 b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m' 417 b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m') 418 419 def test_only_one_bom(self): 420 _,_,reader,writer = codecs.lookup(self.encoding) 421 # encode some stream 422 s = io.BytesIO() 423 f = writer(s) 424 f.write("spam") 425 f.write("spam") 426 d = s.getvalue() 427 # check whether there is exactly one BOM in it 428 self.assertTrue(d == self.spamle or d == self.spambe) 429 # try to read it back 430 s = io.BytesIO(d) 431 f = reader(s) 432 self.assertEqual(f.read(), "spamspam") 433 434 def test_badbom(self): 435 s = io.BytesIO(4*b"\xff") 436 f = codecs.getreader(self.encoding)(s) 437 self.assertRaises(UnicodeError, f.read) 438 439 s = io.BytesIO(8*b"\xff") 440 f = codecs.getreader(self.encoding)(s) 441 self.assertRaises(UnicodeError, f.read) 442 443 def test_partial(self): 444 self.check_partial( 445 "\x00\xff\u0100\uffff\U00010000", 446 [ 447 "", # first byte of BOM read 448 "", # second byte of BOM read 449 "", # third byte of BOM read 450 "", # fourth byte of BOM read => byteorder known 451 "", 452 "", 453 "", 454 "\x00", 455 "\x00", 456 "\x00", 457 "\x00", 458 "\x00\xff", 459 "\x00\xff", 460 "\x00\xff", 461 "\x00\xff", 462 "\x00\xff\u0100", 463 "\x00\xff\u0100", 464 "\x00\xff\u0100", 465 "\x00\xff\u0100", 466 "\x00\xff\u0100\uffff", 467 "\x00\xff\u0100\uffff", 468 "\x00\xff\u0100\uffff", 469 "\x00\xff\u0100\uffff", 470 "\x00\xff\u0100\uffff\U00010000", 471 ] 472 ) 473 474 def test_handlers(self): 475 self.assertEqual(('\ufffd', 1), 476 codecs.utf_32_decode(b'\x01', 'replace', True)) 477 self.assertEqual(('', 1), 478 codecs.utf_32_decode(b'\x01', 'ignore', True)) 479 480 def test_errors(self): 481 self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode, 482 b"\xff", "strict", True) 483 484 def test_decoder_state(self): 485 self.check_state_handling_decode(self.encoding, 486 "spamspam", self.spamle) 487 self.check_state_handling_decode(self.encoding, 488 "spamspam", self.spambe) 489 490 def test_issue8941(self): 491 # Issue #8941: insufficient result allocation when decoding into 492 # surrogate pairs on UCS-2 builds. 493 encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024 494 self.assertEqual('\U00010000' * 1024, 495 codecs.utf_32_decode(encoded_le)[0]) 496 encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024 497 self.assertEqual('\U00010000' * 1024, 498 codecs.utf_32_decode(encoded_be)[0]) 499 500 501 class UTF32LETest(ReadTest, unittest.TestCase): 502 encoding = "utf-32-le" 503 ill_formed_sequence = b"\x80\xdc\x00\x00" 504 505 def test_partial(self): 506 self.check_partial( 507 "\x00\xff\u0100\uffff\U00010000", 508 [ 509 "", 510 "", 511 "", 512 "\x00", 513 "\x00", 514 "\x00", 515 "\x00", 516 "\x00\xff", 517 "\x00\xff", 518 "\x00\xff", 519 "\x00\xff", 520 "\x00\xff\u0100", 521 "\x00\xff\u0100", 522 "\x00\xff\u0100", 523 "\x00\xff\u0100", 524 "\x00\xff\u0100\uffff", 525 "\x00\xff\u0100\uffff", 526 "\x00\xff\u0100\uffff", 527 "\x00\xff\u0100\uffff", 528 "\x00\xff\u0100\uffff\U00010000", 529 ] 530 ) 531 532 def test_simple(self): 533 self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00") 534 535 def test_errors(self): 536 self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode, 537 b"\xff", "strict", True) 538 539 def test_issue8941(self): 540 # Issue #8941: insufficient result allocation when decoding into 541 # surrogate pairs on UCS-2 builds. 542 encoded = b'\x00\x00\x01\x00' * 1024 543 self.assertEqual('\U00010000' * 1024, 544 codecs.utf_32_le_decode(encoded)[0]) 545 546 547 class UTF32BETest(ReadTest, unittest.TestCase): 548 encoding = "utf-32-be" 549 ill_formed_sequence = b"\x00\x00\xdc\x80" 550 551 def test_partial(self): 552 self.check_partial( 553 "\x00\xff\u0100\uffff\U00010000", 554 [ 555 "", 556 "", 557 "", 558 "\x00", 559 "\x00", 560 "\x00", 561 "\x00", 562 "\x00\xff", 563 "\x00\xff", 564 "\x00\xff", 565 "\x00\xff", 566 "\x00\xff\u0100", 567 "\x00\xff\u0100", 568 "\x00\xff\u0100", 569 "\x00\xff\u0100", 570 "\x00\xff\u0100\uffff", 571 "\x00\xff\u0100\uffff", 572 "\x00\xff\u0100\uffff", 573 "\x00\xff\u0100\uffff", 574 "\x00\xff\u0100\uffff\U00010000", 575 ] 576 ) 577 578 def test_simple(self): 579 self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03") 580 581 def test_errors(self): 582 self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode, 583 b"\xff", "strict", True) 584 585 def test_issue8941(self): 586 # Issue #8941: insufficient result allocation when decoding into 587 # surrogate pairs on UCS-2 builds. 588 encoded = b'\x00\x01\x00\x00' * 1024 589 self.assertEqual('\U00010000' * 1024, 590 codecs.utf_32_be_decode(encoded)[0]) 591 592 593 class UTF16Test(ReadTest, unittest.TestCase): 594 encoding = "utf-16" 595 if sys.byteorder == 'little': 596 ill_formed_sequence = b"\x80\xdc" 597 else: 598 ill_formed_sequence = b"\xdc\x80" 599 600 spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00' 601 spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m' 602 603 def test_only_one_bom(self): 604 _,_,reader,writer = codecs.lookup(self.encoding) 605 # encode some stream 606 s = io.BytesIO() 607 f = writer(s) 608 f.write("spam") 609 f.write("spam") 610 d = s.getvalue() 611 # check whether there is exactly one BOM in it 612 self.assertTrue(d == self.spamle or d == self.spambe) 613 # try to read it back 614 s = io.BytesIO(d) 615 f = reader(s) 616 self.assertEqual(f.read(), "spamspam") 617 618 def test_badbom(self): 619 s = io.BytesIO(b"\xff\xff") 620 f = codecs.getreader(self.encoding)(s) 621 self.assertRaises(UnicodeError, f.read) 622 623 s = io.BytesIO(b"\xff\xff\xff\xff") 624 f = codecs.getreader(self.encoding)(s) 625 self.assertRaises(UnicodeError, f.read) 626 627 def test_partial(self): 628 self.check_partial( 629 "\x00\xff\u0100\uffff\U00010000", 630 [ 631 "", # first byte of BOM read 632 "", # second byte of BOM read => byteorder known 633 "", 634 "\x00", 635 "\x00", 636 "\x00\xff", 637 "\x00\xff", 638 "\x00\xff\u0100", 639 "\x00\xff\u0100", 640 "\x00\xff\u0100\uffff", 641 "\x00\xff\u0100\uffff", 642 "\x00\xff\u0100\uffff", 643 "\x00\xff\u0100\uffff", 644 "\x00\xff\u0100\uffff\U00010000", 645 ] 646 ) 647 648 def test_handlers(self): 649 self.assertEqual(('\ufffd', 1), 650 codecs.utf_16_decode(b'\x01', 'replace', True)) 651 self.assertEqual(('', 1), 652 codecs.utf_16_decode(b'\x01', 'ignore', True)) 653 654 def test_errors(self): 655 self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, 656 b"\xff", "strict", True) 657 658 def test_decoder_state(self): 659 self.check_state_handling_decode(self.encoding, 660 "spamspam", self.spamle) 661 self.check_state_handling_decode(self.encoding, 662 "spamspam", self.spambe) 663 664 def test_bug691291(self): 665 # Files are always opened in binary mode, even if no binary mode was 666 # specified. This means that no automatic conversion of '\n' is done 667 # on reading and writing. 668 s1 = 'Hello\r\nworld\r\n' 669 670 s = s1.encode(self.encoding) 671 self.addCleanup(support.unlink, support.TESTFN) 672 with open(support.TESTFN, 'wb') as fp: 673 fp.write(s) 674 with support.check_warnings(('', DeprecationWarning)): 675 reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding) 676 with reader: 677 self.assertEqual(reader.read(), s1) 678 679 class UTF16LETest(ReadTest, unittest.TestCase): 680 encoding = "utf-16-le" 681 ill_formed_sequence = b"\x80\xdc" 682 683 def test_partial(self): 684 self.check_partial( 685 "\x00\xff\u0100\uffff\U00010000", 686 [ 687 "", 688 "\x00", 689 "\x00", 690 "\x00\xff", 691 "\x00\xff", 692 "\x00\xff\u0100", 693 "\x00\xff\u0100", 694 "\x00\xff\u0100\uffff", 695 "\x00\xff\u0100\uffff", 696 "\x00\xff\u0100\uffff", 697 "\x00\xff\u0100\uffff", 698 "\x00\xff\u0100\uffff\U00010000", 699 ] 700 ) 701 702 def test_errors(self): 703 tests = [ 704 (b'\xff', '\ufffd'), 705 (b'A\x00Z', 'A\ufffd'), 706 (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'), 707 (b'\x00\xd8', '\ufffd'), 708 (b'\x00\xd8A', '\ufffd'), 709 (b'\x00\xd8A\x00', '\ufffdA'), 710 (b'\x00\xdcA\x00', '\ufffdA'), 711 ] 712 for raw, expected in tests: 713 self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, 714 raw, 'strict', True) 715 self.assertEqual(raw.decode('utf-16le', 'replace'), expected) 716 717 def test_nonbmp(self): 718 self.assertEqual("\U00010203".encode(self.encoding), 719 b'\x00\xd8\x03\xde') 720 self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding), 721 "\U00010203") 722 723 class UTF16BETest(ReadTest, unittest.TestCase): 724 encoding = "utf-16-be" 725 ill_formed_sequence = b"\xdc\x80" 726 727 def test_partial(self): 728 self.check_partial( 729 "\x00\xff\u0100\uffff\U00010000", 730 [ 731 "", 732 "\x00", 733 "\x00", 734 "\x00\xff", 735 "\x00\xff", 736 "\x00\xff\u0100", 737 "\x00\xff\u0100", 738 "\x00\xff\u0100\uffff", 739 "\x00\xff\u0100\uffff", 740 "\x00\xff\u0100\uffff", 741 "\x00\xff\u0100\uffff", 742 "\x00\xff\u0100\uffff\U00010000", 743 ] 744 ) 745 746 def test_errors(self): 747 tests = [ 748 (b'\xff', '\ufffd'), 749 (b'\x00A\xff', 'A\ufffd'), 750 (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'), 751 (b'\xd8\x00', '\ufffd'), 752 (b'\xd8\x00\xdc', '\ufffd'), 753 (b'\xd8\x00\x00A', '\ufffdA'), 754 (b'\xdc\x00\x00A', '\ufffdA'), 755 ] 756 for raw, expected in tests: 757 self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, 758 raw, 'strict', True) 759 self.assertEqual(raw.decode('utf-16be', 'replace'), expected) 760 761 def test_nonbmp(self): 762 self.assertEqual("\U00010203".encode(self.encoding), 763 b'\xd8\x00\xde\x03') 764 self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding), 765 "\U00010203") 766 767 class UTF8Test(ReadTest, unittest.TestCase): 768 encoding = "utf-8" 769 ill_formed_sequence = b"\xed\xb2\x80" 770 ill_formed_sequence_replace = "\ufffd" * 3 771 BOM = b'' 772 773 def test_partial(self): 774 self.check_partial( 775 "\x00\xff\u07ff\u0800\uffff\U00010000", 776 [ 777 "\x00", 778 "\x00", 779 "\x00\xff", 780 "\x00\xff", 781 "\x00\xff\u07ff", 782 "\x00\xff\u07ff", 783 "\x00\xff\u07ff", 784 "\x00\xff\u07ff\u0800", 785 "\x00\xff\u07ff\u0800", 786 "\x00\xff\u07ff\u0800", 787 "\x00\xff\u07ff\u0800\uffff", 788 "\x00\xff\u07ff\u0800\uffff", 789 "\x00\xff\u07ff\u0800\uffff", 790 "\x00\xff\u07ff\u0800\uffff", 791 "\x00\xff\u07ff\u0800\uffff\U00010000", 792 ] 793 ) 794 795 def test_decoder_state(self): 796 u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff" 797 self.check_state_handling_decode(self.encoding, 798 u, u.encode(self.encoding)) 799 800 def test_decode_error(self): 801 for data, error_handler, expected in ( 802 (b'[\x80\xff]', 'ignore', '[]'), 803 (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'), 804 (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'), 805 (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'), 806 ): 807 with self.subTest(data=data, error_handler=error_handler, 808 expected=expected): 809 self.assertEqual(data.decode(self.encoding, error_handler), 810 expected) 811 812 def test_lone_surrogates(self): 813 super().test_lone_surrogates() 814 # not sure if this is making sense for 815 # UTF-16 and UTF-32 816 self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"), 817 self.BOM + b'[\x80]') 818 819 with self.assertRaises(UnicodeEncodeError) as cm: 820 "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape") 821 exc = cm.exception 822 self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF') 823 824 def test_surrogatepass_handler(self): 825 self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"), 826 self.BOM + b"abc\xed\xa0\x80def") 827 self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"), 828 self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80") 829 self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"), 830 self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]') 831 832 self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"), 833 "abc\ud800def") 834 self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"), 835 "\U00010fff\uD800") 836 837 self.assertTrue(codecs.lookup_error("surrogatepass")) 838 with self.assertRaises(UnicodeDecodeError): 839 b"abc\xed\xa0".decode(self.encoding, "surrogatepass") 840 with self.assertRaises(UnicodeDecodeError): 841 b"abc\xed\xa0z".decode(self.encoding, "surrogatepass") 842 843 844 @unittest.skipUnless(sys.platform == 'win32', 845 'cp65001 is a Windows-only codec') 846 class CP65001Test(ReadTest, unittest.TestCase): 847 encoding = "cp65001" 848 849 def test_encode(self): 850 tests = [ 851 ('abc', 'strict', b'abc'), 852 ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'), 853 ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'), 854 ('\udc80', 'strict', None), 855 ('\udc80', 'ignore', b''), 856 ('\udc80', 'replace', b'?'), 857 ('\udc80', 'backslashreplace', b'\\udc80'), 858 ('\udc80', 'namereplace', b'\\udc80'), 859 ('\udc80', 'surrogatepass', b'\xed\xb2\x80'), 860 ] 861 for text, errors, expected in tests: 862 if expected is not None: 863 try: 864 encoded = text.encode('cp65001', errors) 865 except UnicodeEncodeError as err: 866 self.fail('Unable to encode %a to cp65001 with ' 867 'errors=%r: %s' % (text, errors, err)) 868 self.assertEqual(encoded, expected, 869 '%a.encode("cp65001", %r)=%a != %a' 870 % (text, errors, encoded, expected)) 871 else: 872 self.assertRaises(UnicodeEncodeError, 873 text.encode, "cp65001", errors) 874 875 def test_decode(self): 876 tests = [ 877 (b'abc', 'strict', 'abc'), 878 (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'), 879 (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'), 880 (b'\xef\xbf\xbd', 'strict', '\ufffd'), 881 (b'[\xc3\xa9]', 'strict', '[\xe9]'), 882 # invalid bytes 883 (b'[\xff]', 'strict', None), 884 (b'[\xff]', 'ignore', '[]'), 885 (b'[\xff]', 'replace', '[\ufffd]'), 886 (b'[\xff]', 'surrogateescape', '[\udcff]'), 887 (b'[\xed\xb2\x80]', 'strict', None), 888 (b'[\xed\xb2\x80]', 'ignore', '[]'), 889 (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'), 890 ] 891 for raw, errors, expected in tests: 892 if expected is not None: 893 try: 894 decoded = raw.decode('cp65001', errors) 895 except UnicodeDecodeError as err: 896 self.fail('Unable to decode %a from cp65001 with ' 897 'errors=%r: %s' % (raw, errors, err)) 898 self.assertEqual(decoded, expected, 899 '%a.decode("cp65001", %r)=%a != %a' 900 % (raw, errors, decoded, expected)) 901 else: 902 self.assertRaises(UnicodeDecodeError, 903 raw.decode, 'cp65001', errors) 904 905 def test_lone_surrogates(self): 906 self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001") 907 self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001") 908 self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"), 909 b'[\\udc80]') 910 self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"), 911 b'[\\udc80]') 912 self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"), 913 b'[�]') 914 self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"), 915 b'[\x80]') 916 self.assertEqual("[\uDC80]".encode("cp65001", "ignore"), 917 b'[]') 918 self.assertEqual("[\uDC80]".encode("cp65001", "replace"), 919 b'[?]') 920 921 def test_surrogatepass_handler(self): 922 self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"), 923 b"abc\xed\xa0\x80def") 924 self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"), 925 "abc\ud800def") 926 self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"), 927 b"\xf0\x90\xbf\xbf\xed\xa0\x80") 928 self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"), 929 "\U00010fff\uD800") 930 self.assertTrue(codecs.lookup_error("surrogatepass")) 931 932 933 class UTF7Test(ReadTest, unittest.TestCase): 934 encoding = "utf-7" 935 936 def test_ascii(self): 937 # Set D (directly encoded characters) 938 set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' 939 'abcdefghijklmnopqrstuvwxyz' 940 '0123456789' 941 '\'(),-./:?') 942 self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii')) 943 self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d) 944 # Set O (optional direct characters) 945 set_o = ' !"#$%&*;<=>@[]^_`{|}' 946 self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii')) 947 self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o) 948 # + 949 self.assertEqual('a+b'.encode(self.encoding), b'a+-b') 950 self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b') 951 # White spaces 952 ws = ' \t\n\r' 953 self.assertEqual(ws.encode(self.encoding), ws.encode('ascii')) 954 self.assertEqual(ws.encode('ascii').decode(self.encoding), ws) 955 # Other ASCII characters 956 other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) - 957 set(set_d + set_o + '+' + ws))) 958 self.assertEqual(other_ascii.encode(self.encoding), 959 b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU' 960 b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-') 961 962 def test_partial(self): 963 self.check_partial( 964 'a+-b\x00c\x80d\u0100e\U00010000f', 965 [ 966 'a', 967 'a', 968 'a+', 969 'a+-', 970 'a+-b', 971 'a+-b', 972 'a+-b', 973 'a+-b', 974 'a+-b', 975 'a+-b\x00', 976 'a+-b\x00c', 977 'a+-b\x00c', 978 'a+-b\x00c', 979 'a+-b\x00c', 980 'a+-b\x00c', 981 'a+-b\x00c\x80', 982 'a+-b\x00c\x80d', 983 'a+-b\x00c\x80d', 984 'a+-b\x00c\x80d', 985 'a+-b\x00c\x80d', 986 'a+-b\x00c\x80d', 987 'a+-b\x00c\x80d\u0100', 988 'a+-b\x00c\x80d\u0100e', 989 'a+-b\x00c\x80d\u0100e', 990 'a+-b\x00c\x80d\u0100e', 991 'a+-b\x00c\x80d\u0100e', 992 'a+-b\x00c\x80d\u0100e', 993 'a+-b\x00c\x80d\u0100e', 994 'a+-b\x00c\x80d\u0100e', 995 'a+-b\x00c\x80d\u0100e', 996 'a+-b\x00c\x80d\u0100e\U00010000', 997 'a+-b\x00c\x80d\u0100e\U00010000f', 998 ] 999 ) 1000 1001 def test_errors(self): 1002 tests = [ 1003 (b'\xffb', '\ufffdb'), 1004 (b'a\xffb', 'a\ufffdb'), 1005 (b'a\xff\xffb', 'a\ufffd\ufffdb'), 1006 (b'a+IK', 'a\ufffd'), 1007 (b'a+IK-b', 'a\ufffdb'), 1008 (b'a+IK,b', 'a\ufffdb'), 1009 (b'a+IKx', 'a\u20ac\ufffd'), 1010 (b'a+IKx-b', 'a\u20ac\ufffdb'), 1011 (b'a+IKwgr', 'a\u20ac\ufffd'), 1012 (b'a+IKwgr-b', 'a\u20ac\ufffdb'), 1013 (b'a+IKwgr,', 'a\u20ac\ufffd'), 1014 (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'), 1015 (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'), 1016 (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'), 1017 (b'a+/,+IKw-b', 'a\ufffd\u20acb'), 1018 (b'a+//,+IKw-b', 'a\ufffd\u20acb'), 1019 (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'), 1020 (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'), 1021 (b'a+IKw-b\xff', 'a\u20acb\ufffd'), 1022 (b'a+IKw\xffb', 'a\u20ac\ufffdb'), 1023 ] 1024 for raw, expected in tests: 1025 with self.subTest(raw=raw): 1026 self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode, 1027 raw, 'strict', True) 1028 self.assertEqual(raw.decode('utf-7', 'replace'), expected) 1029 1030 def test_nonbmp(self): 1031 self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-') 1032 self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-') 1033 self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0') 1034 self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0') 1035 self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-') 1036 self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0') 1037 self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0') 1038 self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding), 1039 b'+IKwgrNgB3KA-') 1040 self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding), 1041 '\u20ac\u20ac\U000104A0') 1042 self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding), 1043 '\u20ac\u20ac\U000104A0') 1044 1045 def test_lone_surrogates(self): 1046 tests = [ 1047 (b'a+2AE-b', 'a\ud801b'), 1048 (b'a+2AE\xffb', 'a\ufffdb'), 1049 (b'a+2AE', 'a\ufffd'), 1050 (b'a+2AEA-b', 'a\ufffdb'), 1051 (b'a+2AH-b', 'a\ufffdb'), 1052 (b'a+IKzYAQ-b', 'a\u20ac\ud801b'), 1053 (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'), 1054 (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'), 1055 (b'a+IKzYAd-b', 'a\u20ac\ufffdb'), 1056 (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'), 1057 (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'), 1058 (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'), 1059 (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'), 1060 ] 1061 for raw, expected in tests: 1062 with self.subTest(raw=raw): 1063 self.assertEqual(raw.decode('utf-7', 'replace'), expected) 1064 1065 1066 class UTF16ExTest(unittest.TestCase): 1067 1068 def test_errors(self): 1069 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True) 1070 1071 def test_bad_args(self): 1072 self.assertRaises(TypeError, codecs.utf_16_ex_decode) 1073 1074 class ReadBufferTest(unittest.TestCase): 1075 1076 def test_array(self): 1077 import array 1078 self.assertEqual( 1079 codecs.readbuffer_encode(array.array("b", b"spam")), 1080 (b"spam", 4) 1081 ) 1082 1083 def test_empty(self): 1084 self.assertEqual(codecs.readbuffer_encode(""), (b"", 0)) 1085 1086 def test_bad_args(self): 1087 self.assertRaises(TypeError, codecs.readbuffer_encode) 1088 self.assertRaises(TypeError, codecs.readbuffer_encode, 42) 1089 1090 class UTF8SigTest(UTF8Test, unittest.TestCase): 1091 encoding = "utf-8-sig" 1092 BOM = codecs.BOM_UTF8 1093 1094 def test_partial(self): 1095 self.check_partial( 1096 "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000", 1097 [ 1098 "", 1099 "", 1100 "", # First BOM has been read and skipped 1101 "", 1102 "", 1103 "\ufeff", # Second BOM has been read and emitted 1104 "\ufeff\x00", # "\x00" read and emitted 1105 "\ufeff\x00", # First byte of encoded "\xff" read 1106 "\ufeff\x00\xff", # Second byte of encoded "\xff" read 1107 "\ufeff\x00\xff", # First byte of encoded "\u07ff" read 1108 "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read 1109 "\ufeff\x00\xff\u07ff", 1110 "\ufeff\x00\xff\u07ff", 1111 "\ufeff\x00\xff\u07ff\u0800", 1112 "\ufeff\x00\xff\u07ff\u0800", 1113 "\ufeff\x00\xff\u07ff\u0800", 1114 "\ufeff\x00\xff\u07ff\u0800\uffff", 1115 "\ufeff\x00\xff\u07ff\u0800\uffff", 1116 "\ufeff\x00\xff\u07ff\u0800\uffff", 1117 "\ufeff\x00\xff\u07ff\u0800\uffff", 1118 "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000", 1119 ] 1120 ) 1121 1122 def test_bug1601501(self): 1123 # SF bug #1601501: check that the codec works with a buffer 1124 self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "") 1125 1126 def test_bom(self): 1127 d = codecs.getincrementaldecoder("utf-8-sig")() 1128 s = "spam" 1129 self.assertEqual(d.decode(s.encode("utf-8-sig")), s) 1130 1131 def test_stream_bom(self): 1132 unistring = "ABC\u00A1\u2200XYZ" 1133 bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ" 1134 1135 reader = codecs.getreader("utf-8-sig") 1136 for sizehint in [None] + list(range(1, 11)) + \ 1137 [64, 128, 256, 512, 1024]: 1138 istream = reader(io.BytesIO(bytestring)) 1139 ostream = io.StringIO() 1140 while 1: 1141 if sizehint is not None: 1142 data = istream.read(sizehint) 1143 else: 1144 data = istream.read() 1145 1146 if not data: 1147 break 1148 ostream.write(data) 1149 1150 got = ostream.getvalue() 1151 self.assertEqual(got, unistring) 1152 1153 def test_stream_bare(self): 1154 unistring = "ABC\u00A1\u2200XYZ" 1155 bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ" 1156 1157 reader = codecs.getreader("utf-8-sig") 1158 for sizehint in [None] + list(range(1, 11)) + \ 1159 [64, 128, 256, 512, 1024]: 1160 istream = reader(io.BytesIO(bytestring)) 1161 ostream = io.StringIO() 1162 while 1: 1163 if sizehint is not None: 1164 data = istream.read(sizehint) 1165 else: 1166 data = istream.read() 1167 1168 if not data: 1169 break 1170 ostream.write(data) 1171 1172 got = ostream.getvalue() 1173 self.assertEqual(got, unistring) 1174 1175 class EscapeDecodeTest(unittest.TestCase): 1176 def test_empty(self): 1177 self.assertEqual(codecs.escape_decode(b""), (b"", 0)) 1178 self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0)) 1179 1180 def test_raw(self): 1181 decode = codecs.escape_decode 1182 for b in range(256): 1183 b = bytes([b]) 1184 if b != b'\\': 1185 self.assertEqual(decode(b + b'0'), (b + b'0', 2)) 1186 1187 def test_escape(self): 1188 decode = codecs.escape_decode 1189 check = coding_checker(self, decode) 1190 check(b"[\\\n]", b"[]") 1191 check(br'[\"]', b'["]') 1192 check(br"[\']", b"[']") 1193 check(br"[\\]", b"[\\]") 1194 check(br"[\a]", b"[\x07]") 1195 check(br"[\b]", b"[\x08]") 1196 check(br"[\t]", b"[\x09]") 1197 check(br"[\n]", b"[\x0a]") 1198 check(br"[\v]", b"[\x0b]") 1199 check(br"[\f]", b"[\x0c]") 1200 check(br"[\r]", b"[\x0d]") 1201 check(br"[\7]", b"[\x07]") 1202 check(br"[\78]", b"[\x078]") 1203 check(br"[\41]", b"[!]") 1204 check(br"[\418]", b"[!8]") 1205 check(br"[\101]", b"[A]") 1206 check(br"[\1010]", b"[A0]") 1207 check(br"[\501]", b"[A]") 1208 check(br"[\x41]", b"[A]") 1209 check(br"[\x410]", b"[A0]") 1210 for i in range(97, 123): 1211 b = bytes([i]) 1212 if b not in b'abfnrtvx': 1213 with self.assertWarns(DeprecationWarning): 1214 check(b"\\" + b, b"\\" + b) 1215 with self.assertWarns(DeprecationWarning): 1216 check(b"\\" + b.upper(), b"\\" + b.upper()) 1217 with self.assertWarns(DeprecationWarning): 1218 check(br"\8", b"\\8") 1219 with self.assertWarns(DeprecationWarning): 1220 check(br"\9", b"\\9") 1221 with self.assertWarns(DeprecationWarning): 1222 check(b"\\\xfa", b"\\\xfa") 1223 1224 def test_errors(self): 1225 decode = codecs.escape_decode 1226 self.assertRaises(ValueError, decode, br"\x") 1227 self.assertRaises(ValueError, decode, br"[\x]") 1228 self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6)) 1229 self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6)) 1230 self.assertRaises(ValueError, decode, br"\x0") 1231 self.assertRaises(ValueError, decode, br"[\x0]") 1232 self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8)) 1233 self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8)) 1234 1235 1236 class RecodingTest(unittest.TestCase): 1237 def test_recoding(self): 1238 f = io.BytesIO() 1239 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8") 1240 f2.write("a") 1241 f2.close() 1242 # Python used to crash on this at exit because of a refcount 1243 # bug in _codecsmodule.c 1244 1245 self.assertTrue(f.closed) 1246 1247 # From RFC 3492 1248 punycode_testcases = [ 1249 # A Arabic (Egyptian): 1250 ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644" 1251 "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F", 1252 b"egbpdaj6bu4bxfgehfvwxn"), 1253 # B Chinese (simplified): 1254 ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587", 1255 b"ihqwcrb4cv8a8dqg056pqjye"), 1256 # C Chinese (traditional): 1257 ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587", 1258 b"ihqwctvzc91f659drss3x8bo0yb"), 1259 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky 1260 ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074" 1261 "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D" 1262 "\u0065\u0073\u006B\u0079", 1263 b"Proprostnemluvesky-uyb24dma41a"), 1264 # E Hebrew: 1265 ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8" 1266 "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2" 1267 "\u05D1\u05E8\u05D9\u05EA", 1268 b"4dbcagdahymbxekheh6e0a7fei0b"), 1269 # F Hindi (Devanagari): 1270 ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D" 1271 "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939" 1272 "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947" 1273 "\u0939\u0948\u0902", 1274 b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"), 1275 1276 #(G) Japanese (kanji and hiragana): 1277 ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092" 1278 "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B", 1279 b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"), 1280 1281 # (H) Korean (Hangul syllables): 1282 ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774" 1283 "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74" 1284 "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C", 1285 b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j" 1286 b"psd879ccm6fea98c"), 1287 1288 # (I) Russian (Cyrillic): 1289 ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E" 1290 "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440" 1291 "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A" 1292 "\u0438", 1293 b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"), 1294 1295 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol 1296 ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070" 1297 "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070" 1298 "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061" 1299 "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070" 1300 "\u0061\u00F1\u006F\u006C", 1301 b"PorqunopuedensimplementehablarenEspaol-fmd56a"), 1302 1303 # (K) Vietnamese: 1304 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\ 1305 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t 1306 ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B" 1307 "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068" 1308 "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067" 1309 "\u0056\u0069\u1EC7\u0074", 1310 b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"), 1311 1312 #(L) 3<nen>B<gumi><kinpachi><sensei> 1313 ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F", 1314 b"3B-ww4c5e180e575a65lsy2b"), 1315 1316 # (M) <amuro><namie>-with-SUPER-MONKEYS 1317 ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074" 1318 "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D" 1319 "\u004F\u004E\u004B\u0045\u0059\u0053", 1320 b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"), 1321 1322 # (N) Hello-Another-Way-<sorezore><no><basho> 1323 ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F" 1324 "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D" 1325 "\u305D\u308C\u305E\u308C\u306E\u5834\u6240", 1326 b"Hello-Another-Way--fc4qua05auwb3674vfr0b"), 1327 1328 # (O) <hitotsu><yane><no><shita>2 1329 ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032", 1330 b"2-u9tlzr9756bt3uc0v"), 1331 1332 # (P) Maji<de>Koi<suru>5<byou><mae> 1333 ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059" 1334 "\u308B\u0035\u79D2\u524D", 1335 b"MajiKoi5-783gue6qz075azm5e"), 1336 1337 # (Q) <pafii>de<runba> 1338 ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0", 1339 b"de-jg4avhby1noc0d"), 1340 1341 # (R) <sono><supiido><de> 1342 ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067", 1343 b"d9juau41awczczp"), 1344 1345 # (S) -> $1.00 <- 1346 ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020" 1347 "\u003C\u002D", 1348 b"-> $1.00 <--") 1349 ] 1350 1351 for i in punycode_testcases: 1352 if len(i)!=2: 1353 print(repr(i)) 1354 1355 1356 class PunycodeTest(unittest.TestCase): 1357 def test_encode(self): 1358 for uni, puny in punycode_testcases: 1359 # Need to convert both strings to lower case, since 1360 # some of the extended encodings use upper case, but our 1361 # code produces only lower case. Converting just puny to 1362 # lower is also insufficient, since some of the input characters 1363 # are upper case. 1364 self.assertEqual( 1365 str(uni.encode("punycode"), "ascii").lower(), 1366 str(puny, "ascii").lower() 1367 ) 1368 1369 def test_decode(self): 1370 for uni, puny in punycode_testcases: 1371 self.assertEqual(uni, puny.decode("punycode")) 1372 puny = puny.decode("ascii").encode("ascii") 1373 self.assertEqual(uni, puny.decode("punycode")) 1374 1375 1376 class UnicodeInternalTest(unittest.TestCase): 1377 @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t') 1378 def test_bug1251300(self): 1379 # Decoding with unicode_internal used to not correctly handle "code 1380 # points" above 0x10ffff on UCS-4 builds. 1381 ok = [ 1382 (b"\x00\x10\xff\xff", "\U0010ffff"), 1383 (b"\x00\x00\x01\x01", "\U00000101"), 1384 (b"", ""), 1385 ] 1386 not_ok = [ 1387 b"\x7f\xff\xff\xff", 1388 b"\x80\x00\x00\x00", 1389 b"\x81\x00\x00\x00", 1390 b"\x00", 1391 b"\x00\x00\x00\x00\x00", 1392 ] 1393 for internal, uni in ok: 1394 if sys.byteorder == "little": 1395 internal = bytes(reversed(internal)) 1396 with support.check_warnings(): 1397 self.assertEqual(uni, internal.decode("unicode_internal")) 1398 for internal in not_ok: 1399 if sys.byteorder == "little": 1400 internal = bytes(reversed(internal)) 1401 with support.check_warnings(('unicode_internal codec has been ' 1402 'deprecated', DeprecationWarning)): 1403 self.assertRaises(UnicodeDecodeError, internal.decode, 1404 "unicode_internal") 1405 if sys.byteorder == "little": 1406 invalid = b"\x00\x00\x11\x00" 1407 invalid_backslashreplace = r"\x00\x00\x11\x00" 1408 else: 1409 invalid = b"\x00\x11\x00\x00" 1410 invalid_backslashreplace = r"\x00\x11\x00\x00" 1411 with support.check_warnings(): 1412 self.assertRaises(UnicodeDecodeError, 1413 invalid.decode, "unicode_internal") 1414 with support.check_warnings(): 1415 self.assertEqual(invalid.decode("unicode_internal", "replace"), 1416 '\ufffd') 1417 with support.check_warnings(): 1418 self.assertEqual(invalid.decode("unicode_internal", "backslashreplace"), 1419 invalid_backslashreplace) 1420 1421 @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t') 1422 def test_decode_error_attributes(self): 1423 try: 1424 with support.check_warnings(('unicode_internal codec has been ' 1425 'deprecated', DeprecationWarning)): 1426 b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal") 1427 except UnicodeDecodeError as ex: 1428 self.assertEqual("unicode_internal", ex.encoding) 1429 self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object) 1430 self.assertEqual(4, ex.start) 1431 self.assertEqual(8, ex.end) 1432 else: 1433 self.fail() 1434 1435 @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t') 1436 def test_decode_callback(self): 1437 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors) 1438 decoder = codecs.getdecoder("unicode_internal") 1439 with support.check_warnings(('unicode_internal codec has been ' 1440 'deprecated', DeprecationWarning)): 1441 ab = "ab".encode("unicode_internal").decode() 1442 ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]), 1443 "ascii"), 1444 "UnicodeInternalTest") 1445 self.assertEqual(("ab", 12), ignored) 1446 1447 def test_encode_length(self): 1448 with support.check_warnings(('unicode_internal codec has been ' 1449 'deprecated', DeprecationWarning)): 1450 # Issue 3739 1451 encoder = codecs.getencoder("unicode_internal") 1452 self.assertEqual(encoder("a")[1], 1) 1453 self.assertEqual(encoder("\xe9\u0142")[1], 2) 1454 1455 self.assertEqual(codecs.escape_encode(br'\x00')[1], 4) 1456 1457 # From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html 1458 nameprep_tests = [ 1459 # 3.1 Map to nothing. 1460 (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar' 1461 b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef' 1462 b'\xb8\x8f\xef\xbb\xbf', 1463 b'foobarbaz'), 1464 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045. 1465 (b'CAFE', 1466 b'cafe'), 1467 # 3.3 Case folding 8bit U+00DF (german sharp s). 1468 # The original test case is bogus; it says \xc3\xdf 1469 (b'\xc3\x9f', 1470 b'ss'), 1471 # 3.4 Case folding U+0130 (turkish capital I with dot). 1472 (b'\xc4\xb0', 1473 b'i\xcc\x87'), 1474 # 3.5 Case folding multibyte U+0143 U+037A. 1475 (b'\xc5\x83\xcd\xba', 1476 b'\xc5\x84 \xce\xb9'), 1477 # 3.6 Case folding U+2121 U+33C6 U+1D7BB. 1478 # XXX: skip this as it fails in UCS-2 mode 1479 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb', 1480 # 'telc\xe2\x88\x95kg\xcf\x83'), 1481 (None, None), 1482 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA. 1483 (b'j\xcc\x8c\xc2\xa0\xc2\xaa', 1484 b'\xc7\xb0 a'), 1485 # 3.8 Case folding U+1FB7 and normalization. 1486 (b'\xe1\xbe\xb7', 1487 b'\xe1\xbe\xb6\xce\xb9'), 1488 # 3.9 Self-reverting case folding U+01F0 and normalization. 1489 # The original test case is bogus, it says `\xc7\xf0' 1490 (b'\xc7\xb0', 1491 b'\xc7\xb0'), 1492 # 3.10 Self-reverting case folding U+0390 and normalization. 1493 (b'\xce\x90', 1494 b'\xce\x90'), 1495 # 3.11 Self-reverting case folding U+03B0 and normalization. 1496 (b'\xce\xb0', 1497 b'\xce\xb0'), 1498 # 3.12 Self-reverting case folding U+1E96 and normalization. 1499 (b'\xe1\xba\x96', 1500 b'\xe1\xba\x96'), 1501 # 3.13 Self-reverting case folding U+1F56 and normalization. 1502 (b'\xe1\xbd\x96', 1503 b'\xe1\xbd\x96'), 1504 # 3.14 ASCII space character U+0020. 1505 (b' ', 1506 b' '), 1507 # 3.15 Non-ASCII 8bit space character U+00A0. 1508 (b'\xc2\xa0', 1509 b' '), 1510 # 3.16 Non-ASCII multibyte space character U+1680. 1511 (b'\xe1\x9a\x80', 1512 None), 1513 # 3.17 Non-ASCII multibyte space character U+2000. 1514 (b'\xe2\x80\x80', 1515 b' '), 1516 # 3.18 Zero Width Space U+200b. 1517 (b'\xe2\x80\x8b', 1518 b''), 1519 # 3.19 Non-ASCII multibyte space character U+3000. 1520 (b'\xe3\x80\x80', 1521 b' '), 1522 # 3.20 ASCII control characters U+0010 U+007F. 1523 (b'\x10\x7f', 1524 b'\x10\x7f'), 1525 # 3.21 Non-ASCII 8bit control character U+0085. 1526 (b'\xc2\x85', 1527 None), 1528 # 3.22 Non-ASCII multibyte control character U+180E. 1529 (b'\xe1\xa0\x8e', 1530 None), 1531 # 3.23 Zero Width No-Break Space U+FEFF. 1532 (b'\xef\xbb\xbf', 1533 b''), 1534 # 3.24 Non-ASCII control character U+1D175. 1535 (b'\xf0\x9d\x85\xb5', 1536 None), 1537 # 3.25 Plane 0 private use character U+F123. 1538 (b'\xef\x84\xa3', 1539 None), 1540 # 3.26 Plane 15 private use character U+F1234. 1541 (b'\xf3\xb1\x88\xb4', 1542 None), 1543 # 3.27 Plane 16 private use character U+10F234. 1544 (b'\xf4\x8f\x88\xb4', 1545 None), 1546 # 3.28 Non-character code point U+8FFFE. 1547 (b'\xf2\x8f\xbf\xbe', 1548 None), 1549 # 3.29 Non-character code point U+10FFFF. 1550 (b'\xf4\x8f\xbf\xbf', 1551 None), 1552 # 3.30 Surrogate code U+DF42. 1553 (b'\xed\xbd\x82', 1554 None), 1555 # 3.31 Non-plain text character U+FFFD. 1556 (b'\xef\xbf\xbd', 1557 None), 1558 # 3.32 Ideographic description character U+2FF5. 1559 (b'\xe2\xbf\xb5', 1560 None), 1561 # 3.33 Display property character U+0341. 1562 (b'\xcd\x81', 1563 b'\xcc\x81'), 1564 # 3.34 Left-to-right mark U+200E. 1565 (b'\xe2\x80\x8e', 1566 None), 1567 # 3.35 Deprecated U+202A. 1568 (b'\xe2\x80\xaa', 1569 None), 1570 # 3.36 Language tagging character U+E0001. 1571 (b'\xf3\xa0\x80\x81', 1572 None), 1573 # 3.37 Language tagging character U+E0042. 1574 (b'\xf3\xa0\x81\x82', 1575 None), 1576 # 3.38 Bidi: RandALCat character U+05BE and LCat characters. 1577 (b'foo\xd6\xbebar', 1578 None), 1579 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters. 1580 (b'foo\xef\xb5\x90bar', 1581 None), 1582 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters. 1583 (b'foo\xef\xb9\xb6bar', 1584 b'foo \xd9\x8ebar'), 1585 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031. 1586 (b'\xd8\xa71', 1587 None), 1588 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628. 1589 (b'\xd8\xa71\xd8\xa8', 1590 b'\xd8\xa71\xd8\xa8'), 1591 # 3.43 Unassigned code point U+E0002. 1592 # Skip this test as we allow unassigned 1593 #(b'\xf3\xa0\x80\x82', 1594 # None), 1595 (None, None), 1596 # 3.44 Larger test (shrinking). 1597 # Original test case reads \xc3\xdf 1598 (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2' 1599 b'\xaa\xce\xb0\xe2\x80\x80', 1600 b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '), 1601 # 3.45 Larger test (expanding). 1602 # Original test case reads \xc3\x9f 1603 (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c' 1604 b'\x80', 1605 b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3' 1606 b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82' 1607 b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88') 1608 ] 1609 1610 1611 class NameprepTest(unittest.TestCase): 1612 def test_nameprep(self): 1613 from encodings.idna import nameprep 1614 for pos, (orig, prepped) in enumerate(nameprep_tests): 1615 if orig is None: 1616 # Skipped 1617 continue 1618 # The Unicode strings are given in UTF-8 1619 orig = str(orig, "utf-8", "surrogatepass") 1620 if prepped is None: 1621 # Input contains prohibited characters 1622 self.assertRaises(UnicodeError, nameprep, orig) 1623 else: 1624 prepped = str(prepped, "utf-8", "surrogatepass") 1625 try: 1626 self.assertEqual(nameprep(orig), prepped) 1627 except Exception as e: 1628 raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e))) 1629 1630 1631 class IDNACodecTest(unittest.TestCase): 1632 def test_builtin_decode(self): 1633 self.assertEqual(str(b"python.org", "idna"), "python.org") 1634 self.assertEqual(str(b"python.org.", "idna"), "python.org.") 1635 self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org") 1636 self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.") 1637 1638 def test_builtin_encode(self): 1639 self.assertEqual("python.org".encode("idna"), b"python.org") 1640 self.assertEqual("python.org.".encode("idna"), b"python.org.") 1641 self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org") 1642 self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.") 1643 1644 def test_stream(self): 1645 r = codecs.getreader("idna")(io.BytesIO(b"abc")) 1646 r.read(3) 1647 self.assertEqual(r.read(), "") 1648 1649 def test_incremental_decode(self): 1650 self.assertEqual( 1651 "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")), 1652 "python.org" 1653 ) 1654 self.assertEqual( 1655 "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")), 1656 "python.org." 1657 ) 1658 self.assertEqual( 1659 "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")), 1660 "pyth\xf6n.org." 1661 ) 1662 self.assertEqual( 1663 "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")), 1664 "pyth\xf6n.org." 1665 ) 1666 1667 decoder = codecs.getincrementaldecoder("idna")() 1668 self.assertEqual(decoder.decode(b"xn--xam", ), "") 1669 self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.") 1670 self.assertEqual(decoder.decode(b"rg"), "") 1671 self.assertEqual(decoder.decode(b"", True), "org") 1672 1673 decoder.reset() 1674 self.assertEqual(decoder.decode(b"xn--xam", ), "") 1675 self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.") 1676 self.assertEqual(decoder.decode(b"rg."), "org.") 1677 self.assertEqual(decoder.decode(b"", True), "") 1678 1679 def test_incremental_encode(self): 1680 self.assertEqual( 1681 b"".join(codecs.iterencode("python.org", "idna")), 1682 b"python.org" 1683 ) 1684 self.assertEqual( 1685 b"".join(codecs.iterencode("python.org.", "idna")), 1686 b"python.org." 1687 ) 1688 self.assertEqual( 1689 b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")), 1690 b"xn--pythn-mua.org." 1691 ) 1692 self.assertEqual( 1693 b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")), 1694 b"xn--pythn-mua.org." 1695 ) 1696 1697 encoder = codecs.getincrementalencoder("idna")() 1698 self.assertEqual(encoder.encode("\xe4x"), b"") 1699 self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.") 1700 self.assertEqual(encoder.encode("", True), b"org") 1701 1702 encoder.reset() 1703 self.assertEqual(encoder.encode("\xe4x"), b"") 1704 self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.") 1705 self.assertEqual(encoder.encode("", True), b"") 1706 1707 def test_errors(self): 1708 """Only supports "strict" error handler""" 1709 "python.org".encode("idna", "strict") 1710 b"python.org".decode("idna", "strict") 1711 for errors in ("ignore", "replace", "backslashreplace", 1712 "surrogateescape"): 1713 self.assertRaises(Exception, "python.org".encode, "idna", errors) 1714 self.assertRaises(Exception, 1715 b"python.org".decode, "idna", errors) 1716 1717 1718 class CodecsModuleTest(unittest.TestCase): 1719 1720 def test_decode(self): 1721 self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'), 1722 '\xe4\xf6\xfc') 1723 self.assertRaises(TypeError, codecs.decode) 1724 self.assertEqual(codecs.decode(b'abc'), 'abc') 1725 self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii') 1726 1727 # test keywords 1728 self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'), 1729 '\xe4\xf6\xfc') 1730 self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'), 1731 '[]') 1732 1733 def test_encode(self): 1734 self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'), 1735 b'\xe4\xf6\xfc') 1736 self.assertRaises(TypeError, codecs.encode) 1737 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__") 1738 self.assertEqual(codecs.encode('abc'), b'abc') 1739 self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii') 1740 1741 # test keywords 1742 self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'), 1743 b'\xe4\xf6\xfc') 1744 self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'), 1745 b'[]') 1746 1747 def test_register(self): 1748 self.assertRaises(TypeError, codecs.register) 1749 self.assertRaises(TypeError, codecs.register, 42) 1750 1751 def test_lookup(self): 1752 self.assertRaises(TypeError, codecs.lookup) 1753 self.assertRaises(LookupError, codecs.lookup, "__spam__") 1754 self.assertRaises(LookupError, codecs.lookup, " ") 1755 1756 def test_getencoder(self): 1757 self.assertRaises(TypeError, codecs.getencoder) 1758 self.assertRaises(LookupError, codecs.getencoder, "__spam__") 1759 1760 def test_getdecoder(self): 1761 self.assertRaises(TypeError, codecs.getdecoder) 1762 self.assertRaises(LookupError, codecs.getdecoder, "__spam__") 1763 1764 def test_getreader(self): 1765 self.assertRaises(TypeError, codecs.getreader) 1766 self.assertRaises(LookupError, codecs.getreader, "__spam__") 1767 1768 def test_getwriter(self): 1769 self.assertRaises(TypeError, codecs.getwriter) 1770 self.assertRaises(LookupError, codecs.getwriter, "__spam__") 1771 1772 def test_lookup_issue1813(self): 1773 # Issue #1813: under Turkish locales, lookup of some codecs failed 1774 # because 'I' is lowercased as "" (dotless i) 1775 oldlocale = locale.setlocale(locale.LC_CTYPE) 1776 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale) 1777 try: 1778 locale.setlocale(locale.LC_CTYPE, 'tr_TR') 1779 except locale.Error: 1780 # Unsupported locale on this system 1781 self.skipTest('test needs Turkish locale') 1782 c = codecs.lookup('ASCII') 1783 self.assertEqual(c.name, 'ascii') 1784 1785 def test_all(self): 1786 api = ( 1787 "encode", "decode", 1788 "register", "CodecInfo", "Codec", "IncrementalEncoder", 1789 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup", 1790 "getencoder", "getdecoder", "getincrementalencoder", 1791 "getincrementaldecoder", "getreader", "getwriter", 1792 "register_error", "lookup_error", 1793 "strict_errors", "replace_errors", "ignore_errors", 1794 "xmlcharrefreplace_errors", "backslashreplace_errors", 1795 "namereplace_errors", 1796 "open", "EncodedFile", 1797 "iterencode", "iterdecode", 1798 "BOM", "BOM_BE", "BOM_LE", 1799 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE", 1800 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE", 1801 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented 1802 "StreamReaderWriter", "StreamRecoder", 1803 ) 1804 self.assertCountEqual(api, codecs.__all__) 1805 for api in codecs.__all__: 1806 getattr(codecs, api) 1807 1808 def test_open(self): 1809 self.addCleanup(support.unlink, support.TESTFN) 1810 for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'): 1811 with self.subTest(mode), \ 1812 codecs.open(support.TESTFN, mode, 'ascii') as file: 1813 self.assertIsInstance(file, codecs.StreamReaderWriter) 1814 1815 def test_undefined(self): 1816 self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined') 1817 self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined') 1818 self.assertRaises(UnicodeError, codecs.encode, '', 'undefined') 1819 self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined') 1820 for errors in ('strict', 'ignore', 'replace', 'backslashreplace'): 1821 self.assertRaises(UnicodeError, 1822 codecs.encode, 'abc', 'undefined', errors) 1823 self.assertRaises(UnicodeError, 1824 codecs.decode, b'abc', 'undefined', errors) 1825 1826 1827 class StreamReaderTest(unittest.TestCase): 1828 1829 def setUp(self): 1830 self.reader = codecs.getreader('utf-8') 1831 self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80') 1832 1833 def test_readlines(self): 1834 f = self.reader(self.stream) 1835 self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00']) 1836 1837 1838 class EncodedFileTest(unittest.TestCase): 1839 1840 def test_basic(self): 1841 f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80') 1842 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8') 1843 self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae') 1844 1845 f = io.BytesIO() 1846 ef = codecs.EncodedFile(f, 'utf-8', 'latin-1') 1847 ef.write(b'\xc3\xbc') 1848 self.assertEqual(f.getvalue(), b'\xfc') 1849 1850 all_unicode_encodings = [ 1851 "ascii", 1852 "big5", 1853 "big5hkscs", 1854 "charmap", 1855 "cp037", 1856 "cp1006", 1857 "cp1026", 1858 "cp1125", 1859 "cp1140", 1860 "cp1250", 1861 "cp1251", 1862 "cp1252", 1863 "cp1253", 1864 "cp1254", 1865 "cp1255", 1866 "cp1256", 1867 "cp1257", 1868 "cp1258", 1869 "cp424", 1870 "cp437", 1871 "cp500", 1872 "cp720", 1873 "cp737", 1874 "cp775", 1875 "cp850", 1876 "cp852", 1877 "cp855", 1878 "cp856", 1879 "cp857", 1880 "cp858", 1881 "cp860", 1882 "cp861", 1883 "cp862", 1884 "cp863", 1885 "cp864", 1886 "cp865", 1887 "cp866", 1888 "cp869", 1889 "cp874", 1890 "cp875", 1891 "cp932", 1892 "cp949", 1893 "cp950", 1894 "euc_jis_2004", 1895 "euc_jisx0213", 1896 "euc_jp", 1897 "euc_kr", 1898 "gb18030", 1899 "gb2312", 1900 "gbk", 1901 "hp_roman8", 1902 "hz", 1903 "idna", 1904 "iso2022_jp", 1905 "iso2022_jp_1", 1906 "iso2022_jp_2", 1907 "iso2022_jp_2004", 1908 "iso2022_jp_3", 1909 "iso2022_jp_ext", 1910 "iso2022_kr", 1911 "iso8859_1", 1912 "iso8859_10", 1913 "iso8859_11", 1914 "iso8859_13", 1915 "iso8859_14", 1916 "iso8859_15", 1917 "iso8859_16", 1918 "iso8859_2", 1919 "iso8859_3", 1920 "iso8859_4", 1921 "iso8859_5", 1922 "iso8859_6", 1923 "iso8859_7", 1924 "iso8859_8", 1925 "iso8859_9", 1926 "johab", 1927 "koi8_r", 1928 "koi8_t", 1929 "koi8_u", 1930 "kz1048", 1931 "latin_1", 1932 "mac_cyrillic", 1933 "mac_greek", 1934 "mac_iceland", 1935 "mac_latin2", 1936 "mac_roman", 1937 "mac_turkish", 1938 "palmos", 1939 "ptcp154", 1940 "punycode", 1941 "raw_unicode_escape", 1942 "shift_jis", 1943 "shift_jis_2004", 1944 "shift_jisx0213", 1945 "tis_620", 1946 "unicode_escape", 1947 "unicode_internal", 1948 "utf_16", 1949 "utf_16_be", 1950 "utf_16_le", 1951 "utf_7", 1952 "utf_8", 1953 ] 1954 1955 if hasattr(codecs, "mbcs_encode"): 1956 all_unicode_encodings.append("mbcs") 1957 if hasattr(codecs, "oem_encode"): 1958 all_unicode_encodings.append("oem") 1959 1960 # The following encoding is not tested, because it's not supposed 1961 # to work: 1962 # "undefined" 1963 1964 # The following encodings don't work in stateful mode 1965 broken_unicode_with_stateful = [ 1966 "punycode", 1967 "unicode_internal" 1968 ] 1969 1970 1971 class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling): 1972 def test_basics(self): 1973 s = "abc123" # all codecs should be able to encode these 1974 for encoding in all_unicode_encodings: 1975 name = codecs.lookup(encoding).name 1976 if encoding.endswith("_codec"): 1977 name += "_codec" 1978 elif encoding == "latin_1": 1979 name = "latin_1" 1980 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-")) 1981 1982 with support.check_warnings(): 1983 # unicode-internal has been deprecated 1984 (b, size) = codecs.getencoder(encoding)(s) 1985 self.assertEqual(size, len(s), "encoding=%r" % encoding) 1986 (chars, size) = codecs.getdecoder(encoding)(b) 1987 self.assertEqual(chars, s, "encoding=%r" % encoding) 1988 1989 if encoding not in broken_unicode_with_stateful: 1990 # check stream reader/writer 1991 q = Queue(b"") 1992 writer = codecs.getwriter(encoding)(q) 1993 encodedresult = b"" 1994 for c in s: 1995 writer.write(c) 1996 chunk = q.read() 1997 self.assertTrue(type(chunk) is bytes, type(chunk)) 1998 encodedresult += chunk 1999 q = Queue(b"") 2000 reader = codecs.getreader(encoding)(q) 2001 decodedresult = "" 2002 for c in encodedresult: 2003 q.write(bytes([c])) 2004 decodedresult += reader.read() 2005 self.assertEqual(decodedresult, s, "encoding=%r" % encoding) 2006 2007 if encoding not in broken_unicode_with_stateful: 2008 # check incremental decoder/encoder and iterencode()/iterdecode() 2009 try: 2010 encoder = codecs.getincrementalencoder(encoding)() 2011 except LookupError: # no IncrementalEncoder 2012 pass 2013 else: 2014 # check incremental decoder/encoder 2015 encodedresult = b"" 2016 for c in s: 2017 encodedresult += encoder.encode(c) 2018 encodedresult += encoder.encode("", True) 2019 decoder = codecs.getincrementaldecoder(encoding)() 2020 decodedresult = "" 2021 for c in encodedresult: 2022 decodedresult += decoder.decode(bytes([c])) 2023 decodedresult += decoder.decode(b"", True) 2024 self.assertEqual(decodedresult, s, 2025 "encoding=%r" % encoding) 2026 2027 # check iterencode()/iterdecode() 2028 result = "".join(codecs.iterdecode( 2029 codecs.iterencode(s, encoding), encoding)) 2030 self.assertEqual(result, s, "encoding=%r" % encoding) 2031 2032 # check iterencode()/iterdecode() with empty string 2033 result = "".join(codecs.iterdecode( 2034 codecs.iterencode("", encoding), encoding)) 2035 self.assertEqual(result, "") 2036 2037 if encoding not in ("idna", "mbcs"): 2038 # check incremental decoder/encoder with errors argument 2039 try: 2040 encoder = codecs.getincrementalencoder(encoding)("ignore") 2041 except LookupError: # no IncrementalEncoder 2042 pass 2043 else: 2044 encodedresult = b"".join(encoder.encode(c) for c in s) 2045 decoder = codecs.getincrementaldecoder(encoding)("ignore") 2046 decodedresult = "".join(decoder.decode(bytes([c])) 2047 for c in encodedresult) 2048 self.assertEqual(decodedresult, s, 2049 "encoding=%r" % encoding) 2050 2051 @support.cpython_only 2052 def test_basics_capi(self): 2053 from _testcapi import codec_incrementalencoder, codec_incrementaldecoder 2054 s = "abc123" # all codecs should be able to encode these 2055 for encoding in all_unicode_encodings: 2056 if encoding not in broken_unicode_with_stateful: 2057 # check incremental decoder/encoder (fetched via the C API) 2058 try: 2059 cencoder = codec_incrementalencoder(encoding) 2060 except LookupError: # no IncrementalEncoder 2061 pass 2062 else: 2063 # check C API 2064 encodedresult = b"" 2065 for c in s: 2066 encodedresult += cencoder.encode(c) 2067 encodedresult += cencoder.encode("", True) 2068 cdecoder = codec_incrementaldecoder(encoding) 2069 decodedresult = "" 2070 for c in encodedresult: 2071 decodedresult += cdecoder.decode(bytes([c])) 2072 decodedresult += cdecoder.decode(b"", True) 2073 self.assertEqual(decodedresult, s, 2074 "encoding=%r" % encoding) 2075 2076 if encoding not in ("idna", "mbcs"): 2077 # check incremental decoder/encoder with errors argument 2078 try: 2079 cencoder = codec_incrementalencoder(encoding, "ignore") 2080 except LookupError: # no IncrementalEncoder 2081 pass 2082 else: 2083 encodedresult = b"".join(cencoder.encode(c) for c in s) 2084 cdecoder = codec_incrementaldecoder(encoding, "ignore") 2085 decodedresult = "".join(cdecoder.decode(bytes([c])) 2086 for c in encodedresult) 2087 self.assertEqual(decodedresult, s, 2088 "encoding=%r" % encoding) 2089 2090 def test_seek(self): 2091 # all codecs should be able to encode these 2092 s = "%s\n%s\n" % (100*"abc123", 100*"def456") 2093 for encoding in all_unicode_encodings: 2094 if encoding == "idna": # FIXME: See SF bug #1163178 2095 continue 2096 if encoding in broken_unicode_with_stateful: 2097 continue 2098 reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding))) 2099 for t in range(5): 2100 # Test that calling seek resets the internal codec state and buffers 2101 reader.seek(0, 0) 2102 data = reader.read() 2103 self.assertEqual(s, data) 2104 2105 def test_bad_decode_args(self): 2106 for encoding in all_unicode_encodings: 2107 decoder = codecs.getdecoder(encoding) 2108 self.assertRaises(TypeError, decoder) 2109 if encoding not in ("idna", "punycode"): 2110 self.assertRaises(TypeError, decoder, 42) 2111 2112 def test_bad_encode_args(self): 2113 for encoding in all_unicode_encodings: 2114 encoder = codecs.getencoder(encoding) 2115 with support.check_warnings(): 2116 # unicode-internal has been deprecated 2117 self.assertRaises(TypeError, encoder) 2118 2119 def test_encoding_map_type_initialized(self): 2120 from encodings import cp1140 2121 # This used to crash, we are only verifying there's no crash. 2122 table_type = type(cp1140.encoding_table) 2123 self.assertEqual(table_type, table_type) 2124 2125 def test_decoder_state(self): 2126 # Check that getstate() and setstate() handle the state properly 2127 u = "abc123" 2128 for encoding in all_unicode_encodings: 2129 if encoding not in broken_unicode_with_stateful: 2130 self.check_state_handling_decode(encoding, u, u.encode(encoding)) 2131 self.check_state_handling_encode(encoding, u, u.encode(encoding)) 2132 2133 2134 class CharmapTest(unittest.TestCase): 2135 def test_decode_with_string_map(self): 2136 self.assertEqual( 2137 codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"), 2138 ("abc", 3) 2139 ) 2140 2141 self.assertEqual( 2142 codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"), 2143 ("\U0010FFFFbc", 3) 2144 ) 2145 2146 self.assertRaises(UnicodeDecodeError, 2147 codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab" 2148 ) 2149 2150 self.assertRaises(UnicodeDecodeError, 2151 codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe" 2152 ) 2153 2154 self.assertEqual( 2155 codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"), 2156 ("ab\ufffd", 3) 2157 ) 2158 2159 self.assertEqual( 2160 codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"), 2161 ("ab\ufffd", 3) 2162 ) 2163 2164 self.assertEqual( 2165 codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab"), 2166 ("ab\\x02", 3) 2167 ) 2168 2169 self.assertEqual( 2170 codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab\ufffe"), 2171 ("ab\\x02", 3) 2172 ) 2173 2174 self.assertEqual( 2175 codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"), 2176 ("ab", 3) 2177 ) 2178 2179 self.assertEqual( 2180 codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"), 2181 ("ab", 3) 2182 ) 2183 2184 allbytes = bytes(range(256)) 2185 self.assertEqual( 2186 codecs.charmap_decode(allbytes, "ignore", ""), 2187 ("", len(allbytes)) 2188 ) 2189 2190 def test_decode_with_int2str_map(self): 2191 self.assertEqual( 2192 codecs.charmap_decode(b"\x00\x01\x02", "strict", 2193 {0: 'a', 1: 'b', 2: 'c'}), 2194 ("abc", 3) 2195 ) 2196 2197 self.assertEqual( 2198 codecs.charmap_decode(b"\x00\x01\x02", "strict", 2199 {0: 'Aa', 1: 'Bb', 2: 'Cc'}), 2200 ("AaBbCc", 3) 2201 ) 2202 2203 self.assertEqual( 2204 codecs.charmap_decode(b"\x00\x01\x02", "strict", 2205 {0: '\U0010FFFF', 1: 'b', 2: 'c'}), 2206 ("\U0010FFFFbc", 3) 2207 ) 2208 2209 self.assertEqual( 2210 codecs.charmap_decode(b"\x00\x01\x02", "strict", 2211 {0: 'a', 1: 'b', 2: ''}), 2212 ("ab", 3) 2213 ) 2214 2215 self.assertRaises(UnicodeDecodeError, 2216 codecs.charmap_decode, b"\x00\x01\x02", "strict", 2217 {0: 'a', 1: 'b'} 2218 ) 2219 2220 self.assertRaises(UnicodeDecodeError, 2221 codecs.charmap_decode, b"\x00\x01\x02", "strict", 2222 {0: 'a', 1: 'b', 2: None} 2223 ) 2224 2225 # Issue #14850 2226 self.assertRaises(UnicodeDecodeError, 2227 codecs.charmap_decode, b"\x00\x01\x02", "strict", 2228 {0: 'a', 1: 'b', 2: '\ufffe'} 2229 ) 2230 2231 self.assertEqual( 2232 codecs.charmap_decode(b"\x00\x01\x02", "replace", 2233 {0: 'a', 1: 'b'}), 2234 ("ab\ufffd", 3) 2235 ) 2236 2237 self.assertEqual( 2238 codecs.charmap_decode(b"\x00\x01\x02", "replace", 2239 {0: 'a', 1: 'b', 2: None}), 2240 ("ab\ufffd", 3) 2241 ) 2242 2243 # Issue #14850 2244 self.assertEqual( 2245 codecs.charmap_decode(b"\x00\x01\x02", "replace", 2246 {0: 'a', 1: 'b', 2: '\ufffe'}), 2247 ("ab\ufffd", 3) 2248 ) 2249 2250 self.assertEqual( 2251 codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", 2252 {0: 'a', 1: 'b'}), 2253 ("ab\\x02", 3) 2254 ) 2255 2256 self.assertEqual( 2257 codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", 2258 {0: 'a', 1: 'b', 2: None}), 2259 ("ab\\x02", 3) 2260 ) 2261 2262 # Issue #14850 2263 self.assertEqual( 2264 codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", 2265 {0: 'a', 1: 'b', 2: '\ufffe'}), 2266 ("ab\\x02", 3) 2267 ) 2268 2269 self.assertEqual( 2270 codecs.charmap_decode(b"\x00\x01\x02", "ignore", 2271 {0: 'a', 1: 'b'}), 2272 ("ab", 3) 2273 ) 2274 2275 self.assertEqual( 2276 codecs.charmap_decode(b"\x00\x01\x02", "ignore", 2277 {0: 'a', 1: 'b', 2: None}), 2278 ("ab", 3) 2279 ) 2280 2281 # Issue #14850 2282 self.assertEqual( 2283 codecs.charmap_decode(b"\x00\x01\x02", "ignore", 2284 {0: 'a', 1: 'b', 2: '\ufffe'}), 2285 ("ab", 3) 2286 ) 2287 2288 allbytes = bytes(range(256)) 2289 self.assertEqual( 2290 codecs.charmap_decode(allbytes, "ignore", {}), 2291 ("", len(allbytes)) 2292 ) 2293 2294 def test_decode_with_int2int_map(self): 2295 a = ord('a') 2296 b = ord('b') 2297 c = ord('c') 2298 2299 self.assertEqual( 2300 codecs.charmap_decode(b"\x00\x01\x02", "strict", 2301 {0: a, 1: b, 2: c}), 2302 ("abc", 3) 2303 ) 2304 2305 # Issue #15379 2306 self.assertEqual( 2307 codecs.charmap_decode(b"\x00\x01\x02", "strict", 2308 {0: 0x10FFFF, 1: b, 2: c}), 2309 ("\U0010FFFFbc", 3) 2310 ) 2311 2312 self.assertEqual( 2313 codecs.charmap_decode(b"\x00\x01\x02", "strict", 2314 {0: sys.maxunicode, 1: b, 2: c}), 2315 (chr(sys.maxunicode) + "bc", 3) 2316 ) 2317 2318 self.assertRaises(TypeError, 2319 codecs.charmap_decode, b"\x00\x01\x02", "strict", 2320 {0: sys.maxunicode + 1, 1: b, 2: c} 2321 ) 2322 2323 self.assertRaises(UnicodeDecodeError, 2324 codecs.charmap_decode, b"\x00\x01\x02", "strict", 2325 {0: a, 1: b}, 2326 ) 2327 2328 self.assertRaises(UnicodeDecodeError, 2329 codecs.charmap_decode, b"\x00\x01\x02", "strict", 2330 {0: a, 1: b, 2: 0xFFFE}, 2331 ) 2332 2333 self.assertEqual( 2334 codecs.charmap_decode(b"\x00\x01\x02", "replace", 2335 {0: a, 1: b}), 2336 ("ab\ufffd", 3) 2337 ) 2338 2339 self.assertEqual( 2340 codecs.charmap_decode(b"\x00\x01\x02", "replace", 2341 {0: a, 1: b, 2: 0xFFFE}), 2342 ("ab\ufffd", 3) 2343 ) 2344 2345 self.assertEqual( 2346 codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", 2347 {0: a, 1: b}), 2348 ("ab\\x02", 3) 2349 ) 2350 2351 self.assertEqual( 2352 codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", 2353 {0: a, 1: b, 2: 0xFFFE}), 2354 ("ab\\x02", 3) 2355 ) 2356 2357 self.assertEqual( 2358 codecs.charmap_decode(b"\x00\x01\x02", "ignore", 2359 {0: a, 1: b}), 2360 ("ab", 3) 2361 ) 2362 2363 self.assertEqual( 2364 codecs.charmap_decode(b"\x00\x01\x02", "ignore", 2365 {0: a, 1: b, 2: 0xFFFE}), 2366 ("ab", 3) 2367 ) 2368 2369 2370 class WithStmtTest(unittest.TestCase): 2371 def test_encodedfile(self): 2372 f = io.BytesIO(b"\xc3\xbc") 2373 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef: 2374 self.assertEqual(ef.read(), b"\xfc") 2375 self.assertTrue(f.closed) 2376 2377 def test_streamreaderwriter(self): 2378 f = io.BytesIO(b"\xc3\xbc") 2379 info = codecs.lookup("utf-8") 2380 with codecs.StreamReaderWriter(f, info.streamreader, 2381 info.streamwriter, 'strict') as srw: 2382 self.assertEqual(srw.read(), "\xfc") 2383 2384 2385 class TypesTest(unittest.TestCase): 2386 def test_decode_unicode(self): 2387 # Most decoders don't accept unicode input 2388 decoders = [ 2389 codecs.utf_7_decode, 2390 codecs.utf_8_decode, 2391 codecs.utf_16_le_decode, 2392 codecs.utf_16_be_decode, 2393 codecs.utf_16_ex_decode, 2394 codecs.utf_32_decode, 2395 codecs.utf_32_le_decode, 2396 codecs.utf_32_be_decode, 2397 codecs.utf_32_ex_decode, 2398 codecs.latin_1_decode, 2399 codecs.ascii_decode, 2400 codecs.charmap_decode, 2401 ] 2402 if hasattr(codecs, "mbcs_decode"): 2403 decoders.append(codecs.mbcs_decode) 2404 for decoder in decoders: 2405 self.assertRaises(TypeError, decoder, "xxx") 2406 2407 def test_unicode_escape(self): 2408 # Escape-decoding a unicode string is supported and gives the same 2409 # result as decoding the equivalent ASCII bytes string. 2410 self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6)) 2411 self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6)) 2412 self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6)) 2413 self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6)) 2414 2415 self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000") 2416 self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10)) 2417 self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "backslashreplace"), 2418 (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10)) 2419 2420 self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000") 2421 self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10)) 2422 self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "backslashreplace"), 2423 (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10)) 2424 2425 2426 class UnicodeEscapeTest(unittest.TestCase): 2427 def test_empty(self): 2428 self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0)) 2429 self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0)) 2430 2431 def test_raw_encode(self): 2432 encode = codecs.unicode_escape_encode 2433 for b in range(32, 127): 2434 if b != b'\\'[0]: 2435 self.assertEqual(encode(chr(b)), (bytes([b]), 1)) 2436 2437 def test_raw_decode(self): 2438 decode = codecs.unicode_escape_decode 2439 for b in range(256): 2440 if b != b'\\'[0]: 2441 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2)) 2442 2443 def test_escape_encode(self): 2444 encode = codecs.unicode_escape_encode 2445 check = coding_checker(self, encode) 2446 check('\t', br'\t') 2447 check('\n', br'\n') 2448 check('\r', br'\r') 2449 check('\\', br'\\') 2450 for b in range(32): 2451 if chr(b) not in '\t\n\r': 2452 check(chr(b), ('\\x%02x' % b).encode()) 2453 for b in range(127, 256): 2454 check(chr(b), ('\\x%02x' % b).encode()) 2455 check('\u20ac', br'\u20ac') 2456 check('\U0001d120', br'\U0001d120') 2457 2458 def test_escape_decode(self): 2459 decode = codecs.unicode_escape_decode 2460 check = coding_checker(self, decode) 2461 check(b"[\\\n]", "[]") 2462 check(br'[\"]', '["]') 2463 check(br"[\']", "[']") 2464 check(br"[\\]", r"[\]") 2465 check(br"[\a]", "[\x07]") 2466 check(br"[\b]", "[\x08]") 2467 check(br"[\t]", "[\x09]") 2468 check(br"[\n]", "[\x0a]") 2469 check(br"[\v]", "[\x0b]") 2470 check(br"[\f]", "[\x0c]") 2471 check(br"[\r]", "[\x0d]") 2472 check(br"[\7]", "[\x07]") 2473 check(br"[\78]", "[\x078]") 2474 check(br"[\41]", "[!]") 2475 check(br"[\418]", "[!8]") 2476 check(br"[\101]", "[A]") 2477 check(br"[\1010]", "[A0]") 2478 check(br"[\x41]", "[A]") 2479 check(br"[\x410]", "[A0]") 2480 check(br"\u20ac", "\u20ac") 2481 check(br"\U0001d120", "\U0001d120") 2482 for i in range(97, 123): 2483 b = bytes([i]) 2484 if b not in b'abfnrtuvx': 2485 with self.assertWarns(DeprecationWarning): 2486 check(b"\\" + b, "\\" + chr(i)) 2487 if b.upper() not in b'UN': 2488 with self.assertWarns(DeprecationWarning): 2489 check(b"\\" + b.upper(), "\\" + chr(i-32)) 2490 with self.assertWarns(DeprecationWarning): 2491 check(br"\8", "\\8") 2492 with self.assertWarns(DeprecationWarning): 2493 check(br"\9", "\\9") 2494 with self.assertWarns(DeprecationWarning): 2495 check(b"\\\xfa", "\\\xfa") 2496 2497 def test_decode_errors(self): 2498 decode = codecs.unicode_escape_decode 2499 for c, d in (b'x', 2), (b'u', 4), (b'U', 4): 2500 for i in range(d): 2501 self.assertRaises(UnicodeDecodeError, decode, 2502 b"\\" + c + b"0"*i) 2503 self.assertRaises(UnicodeDecodeError, decode, 2504 b"[\\" + c + b"0"*i + b"]") 2505 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i 2506 self.assertEqual(decode(data, "ignore"), ("[]", len(data))) 2507 self.assertEqual(decode(data, "replace"), 2508 ("[\ufffd]\ufffd", len(data))) 2509 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000") 2510 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10)) 2511 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10)) 2512 2513 2514 class RawUnicodeEscapeTest(unittest.TestCase): 2515 def test_empty(self): 2516 self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0)) 2517 self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0)) 2518 2519 def test_raw_encode(self): 2520 encode = codecs.raw_unicode_escape_encode 2521 for b in range(256): 2522 self.assertEqual(encode(chr(b)), (bytes([b]), 1)) 2523 2524 def test_raw_decode(self): 2525 decode = codecs.raw_unicode_escape_decode 2526 for b in range(256): 2527 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2)) 2528 2529 def test_escape_encode(self): 2530 encode = codecs.raw_unicode_escape_encode 2531 check = coding_checker(self, encode) 2532 for b in range(256): 2533 if b not in b'uU': 2534 check('\\' + chr(b), b'\\' + bytes([b])) 2535 check('\u20ac', br'\u20ac') 2536 check('\U0001d120', br'\U0001d120') 2537 2538 def test_escape_decode(self): 2539 decode = codecs.raw_unicode_escape_decode 2540 check = coding_checker(self, decode) 2541 for b in range(256): 2542 if b not in b'uU': 2543 check(b'\\' + bytes([b]), '\\' + chr(b)) 2544 check(br"\u20ac", "\u20ac") 2545 check(br"\U0001d120", "\U0001d120") 2546 2547 def test_decode_errors(self): 2548 decode = codecs.raw_unicode_escape_decode 2549 for c, d in (b'u', 4), (b'U', 4): 2550 for i in range(d): 2551 self.assertRaises(UnicodeDecodeError, decode, 2552 b"\\" + c + b"0"*i) 2553 self.assertRaises(UnicodeDecodeError, decode, 2554 b"[\\" + c + b"0"*i + b"]") 2555 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i 2556 self.assertEqual(decode(data, "ignore"), ("[]", len(data))) 2557 self.assertEqual(decode(data, "replace"), 2558 ("[\ufffd]\ufffd", len(data))) 2559 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000") 2560 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10)) 2561 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10)) 2562 2563 2564 class EscapeEncodeTest(unittest.TestCase): 2565 2566 def test_escape_encode(self): 2567 tests = [ 2568 (b'', (b'', 0)), 2569 (b'foobar', (b'foobar', 6)), 2570 (b'spam\0eggs', (b'spam\\x00eggs', 9)), 2571 (b'a\'b', (b"a\\'b", 3)), 2572 (b'b\\c', (b'b\\\\c', 3)), 2573 (b'c\nd', (b'c\\nd', 3)), 2574 (b'd\re', (b'd\\re', 3)), 2575 (b'f\x7fg', (b'f\\x7fg', 3)), 2576 ] 2577 for data, output in tests: 2578 with self.subTest(data=data): 2579 self.assertEqual(codecs.escape_encode(data), output) 2580 self.assertRaises(TypeError, codecs.escape_encode, 'spam') 2581 self.assertRaises(TypeError, codecs.escape_encode, bytearray(b'spam')) 2582 2583 2584 class SurrogateEscapeTest(unittest.TestCase): 2585 2586 def test_utf8(self): 2587 # Bad byte 2588 self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"), 2589 "foo\udc80bar") 2590 self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"), 2591 b"foo\x80bar") 2592 # bad-utf-8 encoded surrogate 2593 self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"), 2594 "\udced\udcb0\udc80") 2595 self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"), 2596 b"\xed\xb0\x80") 2597 2598 def test_ascii(self): 2599 # bad byte 2600 self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"), 2601 "foo\udc80bar") 2602 self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"), 2603 b"foo\x80bar") 2604 2605 def test_charmap(self): 2606 # bad byte: \xa5 is unmapped in iso-8859-3 2607 self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"), 2608 "foo\udca5bar") 2609 self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"), 2610 b"foo\xa5bar") 2611 2612 def test_latin1(self): 2613 # Issue6373 2614 self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"), 2615 b"\xe4\xeb\xef\xf6\xfc") 2616 2617 2618 class BomTest(unittest.TestCase): 2619 def test_seek0(self): 2620 data = "1234567890" 2621 tests = ("utf-16", 2622 "utf-16-le", 2623 "utf-16-be", 2624 "utf-32", 2625 "utf-32-le", 2626 "utf-32-be") 2627 self.addCleanup(support.unlink, support.TESTFN) 2628 for encoding in tests: 2629 # Check if the BOM is written only once 2630 with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f: 2631 f.write(data) 2632 f.write(data) 2633 f.seek(0) 2634 self.assertEqual(f.read(), data * 2) 2635 f.seek(0) 2636 self.assertEqual(f.read(), data * 2) 2637 2638 # Check that the BOM is written after a seek(0) 2639 with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f: 2640 f.write(data[0]) 2641 self.assertNotEqual(f.tell(), 0) 2642 f.seek(0) 2643 f.write(data) 2644 f.seek(0) 2645 self.assertEqual(f.read(), data) 2646 2647 # (StreamWriter) Check that the BOM is written after a seek(0) 2648 with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f: 2649 f.writer.write(data[0]) 2650 self.assertNotEqual(f.writer.tell(), 0) 2651 f.writer.seek(0) 2652 f.writer.write(data) 2653 f.seek(0) 2654 self.assertEqual(f.read(), data) 2655 2656 # Check that the BOM is not written after a seek() at a position 2657 # different than the start 2658 with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f: 2659 f.write(data) 2660 f.seek(f.tell()) 2661 f.write(data) 2662 f.seek(0) 2663 self.assertEqual(f.read(), data * 2) 2664 2665 # (StreamWriter) Check that the BOM is not written after a seek() 2666 # at a position different than the start 2667 with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f: 2668 f.writer.write(data) 2669 f.writer.seek(f.writer.tell()) 2670 f.writer.write(data) 2671 f.seek(0) 2672 self.assertEqual(f.read(), data * 2) 2673 2674 2675 bytes_transform_encodings = [ 2676 "base64_codec", 2677 "uu_codec", 2678 "quopri_codec", 2679 "hex_codec", 2680 ] 2681 2682 transform_aliases = { 2683 "base64_codec": ["base64", "base_64"], 2684 "uu_codec": ["uu"], 2685 "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"], 2686 "hex_codec": ["hex"], 2687 "rot_13": ["rot13"], 2688 } 2689 2690 try: 2691 import zlib 2692 except ImportError: 2693 zlib = None 2694 else: 2695 bytes_transform_encodings.append("zlib_codec") 2696 transform_aliases["zlib_codec"] = ["zip", "zlib"] 2697 try: 2698 import bz2 2699 except ImportError: 2700 pass 2701 else: 2702 bytes_transform_encodings.append("bz2_codec") 2703 transform_aliases["bz2_codec"] = ["bz2"] 2704 2705 2706 class TransformCodecTest(unittest.TestCase): 2707 2708 def test_basics(self): 2709 binput = bytes(range(256)) 2710 for encoding in bytes_transform_encodings: 2711 with self.subTest(encoding=encoding): 2712 # generic codecs interface 2713 (o, size) = codecs.getencoder(encoding)(binput) 2714 self.assertEqual(size, len(binput)) 2715 (i, size) = codecs.getdecoder(encoding)(o) 2716 self.assertEqual(size, len(o)) 2717 self.assertEqual(i, binput) 2718 2719 def test_read(self): 2720 for encoding in bytes_transform_encodings: 2721 with self.subTest(encoding=encoding): 2722 sin = codecs.encode(b"\x80", encoding) 2723 reader = codecs.getreader(encoding)(io.BytesIO(sin)) 2724 sout = reader.read() 2725 self.assertEqual(sout, b"\x80") 2726 2727 def test_readline(self): 2728 for encoding in bytes_transform_encodings: 2729 with self.subTest(encoding=encoding): 2730 sin = codecs.encode(b"\x80", encoding) 2731 reader = codecs.getreader(encoding)(io.BytesIO(sin)) 2732 sout = reader.readline() 2733 self.assertEqual(sout, b"\x80") 2734 2735 def test_buffer_api_usage(self): 2736 # We check all the transform codecs accept memoryview input 2737 # for encoding and decoding 2738 # and also that they roundtrip correctly 2739 original = b"12345\x80" 2740 for encoding in bytes_transform_encodings: 2741 with self.subTest(encoding=encoding): 2742 data = original 2743 view = memoryview(data) 2744 data = codecs.encode(data, encoding) 2745 view_encoded = codecs.encode(view, encoding) 2746 self.assertEqual(view_encoded, data) 2747 view = memoryview(data) 2748 data = codecs.decode(data, encoding) 2749 self.assertEqual(data, original) 2750 view_decoded = codecs.decode(view, encoding) 2751 self.assertEqual(view_decoded, data) 2752 2753 def test_text_to_binary_blacklists_binary_transforms(self): 2754 # Check binary -> binary codecs give a good error for str input 2755 bad_input = "bad input type" 2756 for encoding in bytes_transform_encodings: 2757 with self.subTest(encoding=encoding): 2758 fmt = (r"{!r} is not a text encoding; " 2759 r"use codecs.encode\(\) to handle arbitrary codecs") 2760 msg = fmt.format(encoding) 2761 with self.assertRaisesRegex(LookupError, msg) as failure: 2762 bad_input.encode(encoding) 2763 self.assertIsNone(failure.exception.__cause__) 2764 2765 def test_text_to_binary_blacklists_text_transforms(self): 2766 # Check str.encode gives a good error message for str -> str codecs 2767 msg = (r"^'rot_13' is not a text encoding; " 2768 r"use codecs.encode\(\) to handle arbitrary codecs") 2769 with self.assertRaisesRegex(LookupError, msg): 2770 "just an example message".encode("rot_13") 2771 2772 def test_binary_to_text_blacklists_binary_transforms(self): 2773 # Check bytes.decode and bytearray.decode give a good error 2774 # message for binary -> binary codecs 2775 data = b"encode first to ensure we meet any format restrictions" 2776 for encoding in bytes_transform_encodings: 2777 with self.subTest(encoding=encoding): 2778 encoded_data = codecs.encode(data, encoding) 2779 fmt = (r"{!r} is not a text encoding; " 2780 r"use codecs.decode\(\) to handle arbitrary codecs") 2781 msg = fmt.format(encoding) 2782 with self.assertRaisesRegex(LookupError, msg): 2783 encoded_data.decode(encoding) 2784 with self.assertRaisesRegex(LookupError, msg): 2785 bytearray(encoded_data).decode(encoding) 2786 2787 def test_binary_to_text_blacklists_text_transforms(self): 2788 # Check str -> str codec gives a good error for binary input 2789 for bad_input in (b"immutable", bytearray(b"mutable")): 2790 with self.subTest(bad_input=bad_input): 2791 msg = (r"^'rot_13' is not a text encoding; " 2792 r"use codecs.decode\(\) to handle arbitrary codecs") 2793 with self.assertRaisesRegex(LookupError, msg) as failure: 2794 bad_input.decode("rot_13") 2795 self.assertIsNone(failure.exception.__cause__) 2796 2797 @unittest.skipUnless(zlib, "Requires zlib support") 2798 def test_custom_zlib_error_is_wrapped(self): 2799 # Check zlib codec gives a good error for malformed input 2800 msg = "^decoding with 'zlib_codec' codec failed" 2801 with self.assertRaisesRegex(Exception, msg) as failure: 2802 codecs.decode(b"hello", "zlib_codec") 2803 self.assertIsInstance(failure.exception.__cause__, 2804 type(failure.exception)) 2805 2806 def test_custom_hex_error_is_wrapped(self): 2807 # Check hex codec gives a good error for malformed input 2808 msg = "^decoding with 'hex_codec' codec failed" 2809 with self.assertRaisesRegex(Exception, msg) as failure: 2810 codecs.decode(b"hello", "hex_codec") 2811 self.assertIsInstance(failure.exception.__cause__, 2812 type(failure.exception)) 2813 2814 # Unfortunately, the bz2 module throws OSError, which the codec 2815 # machinery currently can't wrap :( 2816 2817 # Ensure codec aliases from http://bugs.python.org/issue7475 work 2818 def test_aliases(self): 2819 for codec_name, aliases in transform_aliases.items(): 2820 expected_name = codecs.lookup(codec_name).name 2821 for alias in aliases: 2822 with self.subTest(alias=alias): 2823 info = codecs.lookup(alias) 2824 self.assertEqual(info.name, expected_name) 2825 2826 def test_quopri_stateless(self): 2827 # Should encode with quotetabs=True 2828 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec") 2829 self.assertEqual(encoded, b"space=20tab=09eol=20\n") 2830 # But should still support unescaped tabs and spaces 2831 unescaped = b"space tab eol\n" 2832 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped) 2833 2834 def test_uu_invalid(self): 2835 # Missing "begin" line 2836 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec") 2837 2838 2839 # The codec system tries to wrap exceptions in order to ensure the error 2840 # mentions the operation being performed and the codec involved. We 2841 # currently *only* want this to happen for relatively stateless 2842 # exceptions, where the only significant information they contain is their 2843 # type and a single str argument. 2844 2845 # Use a local codec registry to avoid appearing to leak objects when 2846 # registering multiple search functions 2847 _TEST_CODECS = {} 2848 2849 def _get_test_codec(codec_name): 2850 return _TEST_CODECS.get(codec_name) 2851 codecs.register(_get_test_codec) # Returns None, not usable as a decorator 2852 2853 try: 2854 # Issue #22166: Also need to clear the internal cache in CPython 2855 from _codecs import _forget_codec 2856 except ImportError: 2857 def _forget_codec(codec_name): 2858 pass 2859 2860 2861 class ExceptionChainingTest(unittest.TestCase): 2862 2863 def setUp(self): 2864 # There's no way to unregister a codec search function, so we just 2865 # ensure we render this one fairly harmless after the test 2866 # case finishes by using the test case repr as the codec name 2867 # The codecs module normalizes codec names, although this doesn't 2868 # appear to be formally documented... 2869 # We also make sure we use a truly unique id for the custom codec 2870 # to avoid issues with the codec cache when running these tests 2871 # multiple times (e.g. when hunting for refleaks) 2872 unique_id = repr(self) + str(id(self)) 2873 self.codec_name = encodings.normalize_encoding(unique_id).lower() 2874 2875 # We store the object to raise on the instance because of a bad 2876 # interaction between the codec caching (which means we can't 2877 # recreate the codec entry) and regrtest refleak hunting (which 2878 # runs the same test instance multiple times). This means we 2879 # need to ensure the codecs call back in to the instance to find 2880 # out which exception to raise rather than binding them in a 2881 # closure to an object that may change on the next run 2882 self.obj_to_raise = RuntimeError 2883 2884 def tearDown(self): 2885 _TEST_CODECS.pop(self.codec_name, None) 2886 # Issue #22166: Also pop from caches to avoid appearance of ref leaks 2887 encodings._cache.pop(self.codec_name, None) 2888 try: 2889 _forget_codec(self.codec_name) 2890 except KeyError: 2891 pass 2892 2893 def set_codec(self, encode, decode): 2894 codec_info = codecs.CodecInfo(encode, decode, 2895 name=self.codec_name) 2896 _TEST_CODECS[self.codec_name] = codec_info 2897 2898 @contextlib.contextmanager 2899 def assertWrapped(self, operation, exc_type, msg): 2900 full_msg = r"{} with {!r} codec failed \({}: {}\)".format( 2901 operation, self.codec_name, exc_type.__name__, msg) 2902 with self.assertRaisesRegex(exc_type, full_msg) as caught: 2903 yield caught 2904 self.assertIsInstance(caught.exception.__cause__, exc_type) 2905 self.assertIsNotNone(caught.exception.__cause__.__traceback__) 2906 2907 def raise_obj(self, *args, **kwds): 2908 # Helper to dynamically change the object raised by a test codec 2909 raise self.obj_to_raise 2910 2911 def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError): 2912 self.obj_to_raise = obj_to_raise 2913 self.set_codec(self.raise_obj, self.raise_obj) 2914 with self.assertWrapped("encoding", exc_type, msg): 2915 "str_input".encode(self.codec_name) 2916 with self.assertWrapped("encoding", exc_type, msg): 2917 codecs.encode("str_input", self.codec_name) 2918 with self.assertWrapped("decoding", exc_type, msg): 2919 b"bytes input".decode(self.codec_name) 2920 with self.assertWrapped("decoding", exc_type, msg): 2921 codecs.decode(b"bytes input", self.codec_name) 2922 2923 def test_raise_by_type(self): 2924 self.check_wrapped(RuntimeError, "") 2925 2926 def test_raise_by_value(self): 2927 msg = "This should be wrapped" 2928 self.check_wrapped(RuntimeError(msg), msg) 2929 2930 def test_raise_grandchild_subclass_exact_size(self): 2931 msg = "This should be wrapped" 2932 class MyRuntimeError(RuntimeError): 2933 __slots__ = () 2934 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError) 2935 2936 def test_raise_subclass_with_weakref_support(self): 2937 msg = "This should be wrapped" 2938 class MyRuntimeError(RuntimeError): 2939 pass 2940 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError) 2941 2942 def check_not_wrapped(self, obj_to_raise, msg): 2943 def raise_obj(*args, **kwds): 2944 raise obj_to_raise 2945 self.set_codec(raise_obj, raise_obj) 2946 with self.assertRaisesRegex(RuntimeError, msg): 2947 "str input".encode(self.codec_name) 2948 with self.assertRaisesRegex(RuntimeError, msg): 2949 codecs.encode("str input", self.codec_name) 2950 with self.assertRaisesRegex(RuntimeError, msg): 2951 b"bytes input".decode(self.codec_name) 2952 with self.assertRaisesRegex(RuntimeError, msg): 2953 codecs.decode(b"bytes input", self.codec_name) 2954 2955 def test_init_override_is_not_wrapped(self): 2956 class CustomInit(RuntimeError): 2957 def __init__(self): 2958 pass 2959 self.check_not_wrapped(CustomInit, "") 2960 2961 def test_new_override_is_not_wrapped(self): 2962 class CustomNew(RuntimeError): 2963 def __new__(cls): 2964 return super().__new__(cls) 2965 self.check_not_wrapped(CustomNew, "") 2966 2967 def test_instance_attribute_is_not_wrapped(self): 2968 msg = "This should NOT be wrapped" 2969 exc = RuntimeError(msg) 2970 exc.attr = 1 2971 self.check_not_wrapped(exc, "^{}$".format(msg)) 2972 2973 def test_non_str_arg_is_not_wrapped(self): 2974 self.check_not_wrapped(RuntimeError(1), "1") 2975 2976 def test_multiple_args_is_not_wrapped(self): 2977 msg_re = r"^\('a', 'b', 'c'\)$" 2978 self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re) 2979 2980 # http://bugs.python.org/issue19609 2981 def test_codec_lookup_failure_not_wrapped(self): 2982 msg = "^unknown encoding: {}$".format(self.codec_name) 2983 # The initial codec lookup should not be wrapped 2984 with self.assertRaisesRegex(LookupError, msg): 2985 "str input".encode(self.codec_name) 2986 with self.assertRaisesRegex(LookupError, msg): 2987 codecs.encode("str input", self.codec_name) 2988 with self.assertRaisesRegex(LookupError, msg): 2989 b"bytes input".decode(self.codec_name) 2990 with self.assertRaisesRegex(LookupError, msg): 2991 codecs.decode(b"bytes input", self.codec_name) 2992 2993 def test_unflagged_non_text_codec_handling(self): 2994 # The stdlib non-text codecs are now marked so they're 2995 # pre-emptively skipped by the text model related methods 2996 # However, third party codecs won't be flagged, so we still make 2997 # sure the case where an inappropriate output type is produced is 2998 # handled appropriately 2999 def encode_to_str(*args, **kwds): 3000 return "not bytes!", 0 3001 def decode_to_bytes(*args, **kwds): 3002 return b"not str!", 0 3003 self.set_codec(encode_to_str, decode_to_bytes) 3004 # No input or output type checks on the codecs module functions 3005 encoded = codecs.encode(None, self.codec_name) 3006 self.assertEqual(encoded, "not bytes!") 3007 decoded = codecs.decode(None, self.codec_name) 3008 self.assertEqual(decoded, b"not str!") 3009 # Text model methods should complain 3010 fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; " 3011 r"use codecs.encode\(\) to encode to arbitrary types$") 3012 msg = fmt.format(self.codec_name) 3013 with self.assertRaisesRegex(TypeError, msg): 3014 "str_input".encode(self.codec_name) 3015 fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; " 3016 r"use codecs.decode\(\) to decode to arbitrary types$") 3017 msg = fmt.format(self.codec_name) 3018 with self.assertRaisesRegex(TypeError, msg): 3019 b"bytes input".decode(self.codec_name) 3020 3021 3022 3023 @unittest.skipUnless(sys.platform == 'win32', 3024 'code pages are specific to Windows') 3025 class CodePageTest(unittest.TestCase): 3026 # CP_UTF8 is already tested by CP65001Test 3027 CP_UTF8 = 65001 3028 3029 def test_invalid_code_page(self): 3030 self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a') 3031 self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a') 3032 self.assertRaises(OSError, codecs.code_page_encode, 123, 'a') 3033 self.assertRaises(OSError, codecs.code_page_decode, 123, b'a') 3034 3035 def test_code_page_name(self): 3036 self.assertRaisesRegex(UnicodeEncodeError, 'cp932', 3037 codecs.code_page_encode, 932, '\xff') 3038 self.assertRaisesRegex(UnicodeDecodeError, 'cp932', 3039 codecs.code_page_decode, 932, b'\x81\x00', 'strict', True) 3040 self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8', 3041 codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True) 3042 3043 def check_decode(self, cp, tests): 3044 for raw, errors, expected in tests: 3045 if expected is not None: 3046 try: 3047 decoded = codecs.code_page_decode(cp, raw, errors, True) 3048 except UnicodeDecodeError as err: 3049 self.fail('Unable to decode %a from "cp%s" with ' 3050 'errors=%r: %s' % (raw, cp, errors, err)) 3051 self.assertEqual(decoded[0], expected, 3052 '%a.decode("cp%s", %r)=%a != %a' 3053 % (raw, cp, errors, decoded[0], expected)) 3054 # assert 0 <= decoded[1] <= len(raw) 3055 self.assertGreaterEqual(decoded[1], 0) 3056 self.assertLessEqual(decoded[1], len(raw)) 3057 else: 3058 self.assertRaises(UnicodeDecodeError, 3059 codecs.code_page_decode, cp, raw, errors, True) 3060 3061 def check_encode(self, cp, tests): 3062 for text, errors, expected in tests: 3063 if expected is not None: 3064 try: 3065 encoded = codecs.code_page_encode(cp, text, errors) 3066 except UnicodeEncodeError as err: 3067 self.fail('Unable to encode %a to "cp%s" with ' 3068 'errors=%r: %s' % (text, cp, errors, err)) 3069 self.assertEqual(encoded[0], expected, 3070 '%a.encode("cp%s", %r)=%a != %a' 3071 % (text, cp, errors, encoded[0], expected)) 3072 self.assertEqual(encoded[1], len(text)) 3073 else: 3074 self.assertRaises(UnicodeEncodeError, 3075 codecs.code_page_encode, cp, text, errors) 3076 3077 def test_cp932(self): 3078 self.check_encode(932, ( 3079 ('abc', 'strict', b'abc'), 3080 ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'), 3081 # test error handlers 3082 ('\xff', 'strict', None), 3083 ('[\xff]', 'ignore', b'[]'), 3084 ('[\xff]', 'replace', b'[y]'), 3085 ('[\u20ac]', 'replace', b'[?]'), 3086 ('[\xff]', 'backslashreplace', b'[\\xff]'), 3087 ('[\xff]', 'namereplace', 3088 b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'), 3089 ('[\xff]', 'xmlcharrefreplace', b'[ÿ]'), 3090 ('\udcff', 'strict', None), 3091 ('[\udcff]', 'surrogateescape', b'[\xff]'), 3092 ('[\udcff]', 'surrogatepass', None), 3093 )) 3094 self.check_decode(932, ( 3095 (b'abc', 'strict', 'abc'), 3096 (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'), 3097 # invalid bytes 3098 (b'[\xff]', 'strict', None), 3099 (b'[\xff]', 'ignore', '[]'), 3100 (b'[\xff]', 'replace', '[\ufffd]'), 3101 (b'[\xff]', 'backslashreplace', '[\\xff]'), 3102 (b'[\xff]', 'surrogateescape', '[\udcff]'), 3103 (b'[\xff]', 'surrogatepass', None), 3104 (b'\x81\x00abc', 'strict', None), 3105 (b'\x81\x00abc', 'ignore', '\x00abc'), 3106 (b'\x81\x00abc', 'replace', '\ufffd\x00abc'), 3107 (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'), 3108 )) 3109 3110 def test_cp1252(self): 3111 self.check_encode(1252, ( 3112 ('abc', 'strict', b'abc'), 3113 ('\xe9\u20ac', 'strict', b'\xe9\x80'), 3114 ('\xff', 'strict', b'\xff'), 3115 # test error handlers 3116 ('\u0141', 'strict', None), 3117 ('\u0141', 'ignore', b''), 3118 ('\u0141', 'replace', b'L'), 3119 ('\udc98', 'surrogateescape', b'\x98'), 3120 ('\udc98', 'surrogatepass', None), 3121 )) 3122 self.check_decode(1252, ( 3123 (b'abc', 'strict', 'abc'), 3124 (b'\xe9\x80', 'strict', '\xe9\u20ac'), 3125 (b'\xff', 'strict', '\xff'), 3126 )) 3127 3128 def test_cp_utf7(self): 3129 cp = 65000 3130 self.check_encode(cp, ( 3131 ('abc', 'strict', b'abc'), 3132 ('\xe9\u20ac', 'strict', b'+AOkgrA-'), 3133 ('\U0010ffff', 'strict', b'+2//f/w-'), 3134 ('\udc80', 'strict', b'+3IA-'), 3135 ('\ufffd', 'strict', b'+//0-'), 3136 )) 3137 self.check_decode(cp, ( 3138 (b'abc', 'strict', 'abc'), 3139 (b'+AOkgrA-', 'strict', '\xe9\u20ac'), 3140 (b'+2//f/w-', 'strict', '\U0010ffff'), 3141 (b'+3IA-', 'strict', '\udc80'), 3142 (b'+//0-', 'strict', '\ufffd'), 3143 # invalid bytes 3144 (b'[+/]', 'strict', '[]'), 3145 (b'[\xff]', 'strict', '[\xff]'), 3146 )) 3147 3148 def test_multibyte_encoding(self): 3149 self.check_decode(932, ( 3150 (b'\x84\xe9\x80', 'ignore', '\u9a3e'), 3151 (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'), 3152 )) 3153 self.check_decode(self.CP_UTF8, ( 3154 (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'), 3155 (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'), 3156 )) 3157 self.check_encode(self.CP_UTF8, ( 3158 ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'), 3159 ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'), 3160 )) 3161 3162 def test_incremental(self): 3163 decoded = codecs.code_page_decode(932, b'\x82', 'strict', False) 3164 self.assertEqual(decoded, ('', 0)) 3165 3166 decoded = codecs.code_page_decode(932, 3167 b'\xe9\x80\xe9', 'strict', 3168 False) 3169 self.assertEqual(decoded, ('\u9a3e', 2)) 3170 3171 decoded = codecs.code_page_decode(932, 3172 b'\xe9\x80\xe9\x80', 'strict', 3173 False) 3174 self.assertEqual(decoded, ('\u9a3e\u9a3e', 4)) 3175 3176 decoded = codecs.code_page_decode(932, 3177 b'abc', 'strict', 3178 False) 3179 self.assertEqual(decoded, ('abc', 3)) 3180 3181 def test_mbcs_alias(self): 3182 # Check that looking up our 'default' codepage will return 3183 # mbcs when we don't have a more specific one available 3184 with mock.patch('_winapi.GetACP', return_value=123): 3185 codec = codecs.lookup('cp123') 3186 self.assertEqual(codec.name, 'mbcs') 3187 3188 @support.bigmemtest(size=2**31, memuse=7, dry_run=False) 3189 def test_large_input(self): 3190 # Test input longer than INT_MAX. 3191 # Input should contain undecodable bytes before and after 3192 # the INT_MAX limit. 3193 encoded = (b'01234567' * (2**28-1) + 3194 b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff') 3195 self.assertEqual(len(encoded), 2**31+2) 3196 decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True) 3197 self.assertEqual(decoded[1], len(encoded)) 3198 del encoded 3199 self.assertEqual(len(decoded[0]), decoded[1]) 3200 self.assertEqual(decoded[0][:10], '0123456701') 3201 self.assertEqual(decoded[0][-20:], 3202 '6701234567' 3203 '\udc85\udc86\udcea\udceb\udcec' 3204 '\udcef\udcfc\udcfd\udcfe\udcff') 3205 3206 3207 class ASCIITest(unittest.TestCase): 3208 def test_encode(self): 3209 self.assertEqual('abc123'.encode('ascii'), b'abc123') 3210 3211 def test_encode_error(self): 3212 for data, error_handler, expected in ( 3213 ('[\x80\xff\u20ac]', 'ignore', b'[]'), 3214 ('[\x80\xff\u20ac]', 'replace', b'[???]'), 3215 ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[€ÿ€]'), 3216 ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace', 3217 b'[\\x80\\xff\\u20ac\\U000abcde]'), 3218 ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'), 3219 ): 3220 with self.subTest(data=data, error_handler=error_handler, 3221 expected=expected): 3222 self.assertEqual(data.encode('ascii', error_handler), 3223 expected) 3224 3225 def test_encode_surrogateescape_error(self): 3226 with self.assertRaises(UnicodeEncodeError): 3227 # the first character can be decoded, but not the second 3228 '\udc80\xff'.encode('ascii', 'surrogateescape') 3229 3230 def test_decode(self): 3231 self.assertEqual(b'abc'.decode('ascii'), 'abc') 3232 3233 def test_decode_error(self): 3234 for data, error_handler, expected in ( 3235 (b'[\x80\xff]', 'ignore', '[]'), 3236 (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'), 3237 (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'), 3238 (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'), 3239 ): 3240 with self.subTest(data=data, error_handler=error_handler, 3241 expected=expected): 3242 self.assertEqual(data.decode('ascii', error_handler), 3243 expected) 3244 3245 3246 class Latin1Test(unittest.TestCase): 3247 def test_encode(self): 3248 for data, expected in ( 3249 ('abc', b'abc'), 3250 ('\x80\xe9\xff', b'\x80\xe9\xff'), 3251 ): 3252 with self.subTest(data=data, expected=expected): 3253 self.assertEqual(data.encode('latin1'), expected) 3254 3255 def test_encode_errors(self): 3256 for data, error_handler, expected in ( 3257 ('[\u20ac\udc80]', 'ignore', b'[]'), 3258 ('[\u20ac\udc80]', 'replace', b'[??]'), 3259 ('[\u20ac\U000abcde]', 'backslashreplace', 3260 b'[\\u20ac\\U000abcde]'), 3261 ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[€�]'), 3262 ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'), 3263 ): 3264 with self.subTest(data=data, error_handler=error_handler, 3265 expected=expected): 3266 self.assertEqual(data.encode('latin1', error_handler), 3267 expected) 3268 3269 def test_encode_surrogateescape_error(self): 3270 with self.assertRaises(UnicodeEncodeError): 3271 # the first character can be decoded, but not the second 3272 '\udc80\u20ac'.encode('latin1', 'surrogateescape') 3273 3274 def test_decode(self): 3275 for data, expected in ( 3276 (b'abc', 'abc'), 3277 (b'[\x80\xff]', '[\x80\xff]'), 3278 ): 3279 with self.subTest(data=data, expected=expected): 3280 self.assertEqual(data.decode('latin1'), expected) 3281 3282 3283 if __name__ == "__main__": 3284 unittest.main() 3285