1 from test import test_support 2 import unittest 3 import codecs 4 import locale 5 import sys, StringIO 6 7 def coding_checker(self, coder): 8 def check(input, expect): 9 self.assertEqual(coder(input), (expect, len(input))) 10 return check 11 12 class Queue(object): 13 """ 14 queue: write bytes at one end, read bytes from the other end 15 """ 16 def __init__(self): 17 self._buffer = "" 18 19 def write(self, chars): 20 self._buffer += chars 21 22 def read(self, size=-1): 23 if size<0: 24 s = self._buffer 25 self._buffer = "" 26 return s 27 else: 28 s = self._buffer[:size] 29 self._buffer = self._buffer[size:] 30 return s 31 32 class ReadTest(unittest.TestCase): 33 def check_partial(self, input, partialresults): 34 # get a StreamReader for the encoding and feed the bytestring version 35 # of input to the reader byte by byte. Read everything available from 36 # the StreamReader and check that the results equal the appropriate 37 # entries from partialresults. 38 q = Queue() 39 r = codecs.getreader(self.encoding)(q) 40 result = u"" 41 for (c, partialresult) in zip(input.encode(self.encoding), partialresults): 42 q.write(c) 43 result += r.read() 44 self.assertEqual(result, partialresult) 45 # check that there's nothing left in the buffers 46 self.assertEqual(r.read(), u"") 47 self.assertEqual(r.bytebuffer, "") 48 self.assertEqual(r.charbuffer, u"") 49 50 # do the check again, this time using an incremental decoder 51 d = codecs.getincrementaldecoder(self.encoding)() 52 result = u"" 53 for (c, partialresult) in zip(input.encode(self.encoding), partialresults): 54 result += d.decode(c) 55 self.assertEqual(result, partialresult) 56 # check that there's nothing left in the buffers 57 self.assertEqual(d.decode("", True), u"") 58 self.assertEqual(d.buffer, "") 59 60 # Check whether the reset method works properly 61 d.reset() 62 result = u"" 63 for (c, partialresult) in zip(input.encode(self.encoding), partialresults): 64 result += d.decode(c) 65 self.assertEqual(result, partialresult) 66 # check that there's nothing left in the buffers 67 self.assertEqual(d.decode("", True), u"") 68 self.assertEqual(d.buffer, "") 69 70 # check iterdecode() 71 encoded = input.encode(self.encoding) 72 self.assertEqual( 73 input, 74 u"".join(codecs.iterdecode(encoded, self.encoding)) 75 ) 76 77 def test_readline(self): 78 def getreader(input): 79 stream = StringIO.StringIO(input.encode(self.encoding)) 80 return codecs.getreader(self.encoding)(stream) 81 82 def readalllines(input, keepends=True, size=None): 83 reader = getreader(input) 84 lines = [] 85 while True: 86 line = reader.readline(size=size, keepends=keepends) 87 if not line: 88 break 89 lines.append(line) 90 return "|".join(lines) 91 92 s = u"foo\nbar\r\nbaz\rspam\u2028eggs" 93 sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs" 94 sexpectednoends = u"foo|bar|baz|spam|eggs" 95 self.assertEqual(readalllines(s, True), sexpected) 96 self.assertEqual(readalllines(s, False), sexpectednoends) 97 self.assertEqual(readalllines(s, True, 10), sexpected) 98 self.assertEqual(readalllines(s, False, 10), sexpectednoends) 99 100 lineends = ("\n", "\r\n", "\r", u"\u2028") 101 # Test long lines (multiple calls to read() in readline()) 102 vw = [] 103 vwo = [] 104 for (i, lineend) in enumerate(lineends): 105 vw.append((i*200+200)*u"\u3042" + lineend) 106 vwo.append((i*200+200)*u"\u3042") 107 self.assertEqual(readalllines("".join(vw), True), "|".join(vw)) 108 self.assertEqual(readalllines("".join(vw), False), "|".join(vwo)) 109 110 # Test lines where the first read might end with \r, so the 111 # reader has to look ahead whether this is a lone \r or a \r\n 112 for size in xrange(80): 113 for lineend in lineends: 114 s = 10*(size*u"a" + lineend + u"xxx\n") 115 reader = getreader(s) 116 for i in xrange(10): 117 self.assertEqual( 118 reader.readline(keepends=True), 119 size*u"a" + lineend, 120 ) 121 self.assertEqual( 122 reader.readline(keepends=True), 123 "xxx\n", 124 ) 125 reader = getreader(s) 126 for i in xrange(10): 127 self.assertEqual( 128 reader.readline(keepends=False), 129 size*u"a", 130 ) 131 self.assertEqual( 132 reader.readline(keepends=False), 133 "xxx", 134 ) 135 136 def test_mixed_readline_and_read(self): 137 lines = ["Humpty Dumpty sat on a wall,\n", 138 "Humpty Dumpty had a great fall.\r\n", 139 "All the king's horses and all the king's men\r", 140 "Couldn't put Humpty together again."] 141 data = ''.join(lines) 142 def getreader(): 143 stream = StringIO.StringIO(data.encode(self.encoding)) 144 return codecs.getreader(self.encoding)(stream) 145 146 # Issue #8260: Test readline() followed by read() 147 f = getreader() 148 self.assertEqual(f.readline(), lines[0]) 149 self.assertEqual(f.read(), ''.join(lines[1:])) 150 self.assertEqual(f.read(), '') 151 152 # Issue #32110: Test readline() followed by read(n) 153 f = getreader() 154 self.assertEqual(f.readline(), lines[0]) 155 self.assertEqual(f.read(1), lines[1][0]) 156 self.assertEqual(f.read(0), '') 157 self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100]) 158 159 # Issue #16636: Test readline() followed by readlines() 160 f = getreader() 161 self.assertEqual(f.readline(), lines[0]) 162 self.assertEqual(f.readlines(), lines[1:]) 163 self.assertEqual(f.read(), '') 164 165 # Test read(n) followed by read() 166 f = getreader() 167 self.assertEqual(f.read(size=40, chars=5), data[:5]) 168 self.assertEqual(f.read(), data[5:]) 169 self.assertEqual(f.read(), '') 170 171 # Issue #32110: Test read(n) followed by read(n) 172 f = getreader() 173 self.assertEqual(f.read(size=40, chars=5), data[:5]) 174 self.assertEqual(f.read(1), data[5]) 175 self.assertEqual(f.read(0), '') 176 self.assertEqual(f.read(100), data[6:106]) 177 178 # Issue #12446: Test read(n) followed by readlines() 179 f = getreader() 180 self.assertEqual(f.read(size=40, chars=5), data[:5]) 181 self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:]) 182 self.assertEqual(f.read(), '') 183 184 def test_bug1175396(self): 185 s = [ 186 '<%!--===================================================\r\n', 187 ' BLOG index page: show recent articles,\r\n', 188 ' today\'s articles, or articles of a specific date.\r\n', 189 '========================================================--%>\r\n', 190 '<%@inputencoding="ISO-8859-1"%>\r\n', 191 '<%@pagetemplate=TEMPLATE.y%>\r\n', 192 '<%@import=import frog.util, frog%>\r\n', 193 '<%@import=import frog.objects%>\r\n', 194 '<%@import=from frog.storageerrors import StorageError%>\r\n', 195 '<%\r\n', 196 '\r\n', 197 'import logging\r\n', 198 'log=logging.getLogger("Snakelets.logger")\r\n', 199 '\r\n', 200 '\r\n', 201 'user=self.SessionCtx.user\r\n', 202 'storageEngine=self.SessionCtx.storageEngine\r\n', 203 '\r\n', 204 '\r\n', 205 'def readArticlesFromDate(date, count=None):\r\n', 206 ' entryids=storageEngine.listBlogEntries(date)\r\n', 207 ' entryids.reverse() # descending\r\n', 208 ' if count:\r\n', 209 ' entryids=entryids[:count]\r\n', 210 ' try:\r\n', 211 ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n', 212 ' except StorageError,x:\r\n', 213 ' log.error("Error loading articles: "+str(x))\r\n', 214 ' self.abort("cannot load articles")\r\n', 215 '\r\n', 216 'showdate=None\r\n', 217 '\r\n', 218 'arg=self.Request.getArg()\r\n', 219 'if arg=="today":\r\n', 220 ' #-------------------- TODAY\'S ARTICLES\r\n', 221 ' self.write("<h2>Today\'s articles</h2>")\r\n', 222 ' showdate = frog.util.isodatestr() \r\n', 223 ' entries = readArticlesFromDate(showdate)\r\n', 224 'elif arg=="active":\r\n', 225 ' #-------------------- ACTIVE ARTICLES redirect\r\n', 226 ' self.Yredirect("active.y")\r\n', 227 'elif arg=="login":\r\n', 228 ' #-------------------- LOGIN PAGE redirect\r\n', 229 ' self.Yredirect("login.y")\r\n', 230 'elif arg=="date":\r\n', 231 ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n', 232 ' showdate = self.Request.getParameter("date")\r\n', 233 ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n', 234 ' entries = readArticlesFromDate(showdate)\r\n', 235 'else:\r\n', 236 ' #-------------------- RECENT ARTICLES\r\n', 237 ' self.write("<h2>Recent articles</h2>")\r\n', 238 ' dates=storageEngine.listBlogEntryDates()\r\n', 239 ' if dates:\r\n', 240 ' entries=[]\r\n', 241 ' SHOWAMOUNT=10\r\n', 242 ' for showdate in dates:\r\n', 243 ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n', 244 ' if len(entries)>=SHOWAMOUNT:\r\n', 245 ' break\r\n', 246 ' \r\n', 247 ] 248 stream = StringIO.StringIO("".join(s).encode(self.encoding)) 249 reader = codecs.getreader(self.encoding)(stream) 250 for (i, line) in enumerate(reader): 251 self.assertEqual(line, s[i]) 252 253 def test_readlinequeue(self): 254 q = Queue() 255 writer = codecs.getwriter(self.encoding)(q) 256 reader = codecs.getreader(self.encoding)(q) 257 258 # No lineends 259 writer.write(u"foo\r") 260 self.assertEqual(reader.readline(keepends=False), u"foo") 261 writer.write(u"\nbar\r") 262 self.assertEqual(reader.readline(keepends=False), u"") 263 self.assertEqual(reader.readline(keepends=False), u"bar") 264 writer.write(u"baz") 265 self.assertEqual(reader.readline(keepends=False), u"baz") 266 self.assertEqual(reader.readline(keepends=False), u"") 267 268 # Lineends 269 writer.write(u"foo\r") 270 self.assertEqual(reader.readline(keepends=True), u"foo\r") 271 writer.write(u"\nbar\r") 272 self.assertEqual(reader.readline(keepends=True), u"\n") 273 self.assertEqual(reader.readline(keepends=True), u"bar\r") 274 writer.write(u"baz") 275 self.assertEqual(reader.readline(keepends=True), u"baz") 276 self.assertEqual(reader.readline(keepends=True), u"") 277 writer.write(u"foo\r\n") 278 self.assertEqual(reader.readline(keepends=True), u"foo\r\n") 279 280 def test_bug1098990_a(self): 281 s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n" 282 s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n" 283 s3 = u"next line.\r\n" 284 285 s = (s1+s2+s3).encode(self.encoding) 286 stream = StringIO.StringIO(s) 287 reader = codecs.getreader(self.encoding)(stream) 288 self.assertEqual(reader.readline(), s1) 289 self.assertEqual(reader.readline(), s2) 290 self.assertEqual(reader.readline(), s3) 291 self.assertEqual(reader.readline(), u"") 292 293 def test_bug1098990_b(self): 294 s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n" 295 s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n" 296 s3 = u"stillokay:bbbbxx\r\n" 297 s4 = u"broken!!!!badbad\r\n" 298 s5 = u"againokay.\r\n" 299 300 s = (s1+s2+s3+s4+s5).encode(self.encoding) 301 stream = StringIO.StringIO(s) 302 reader = codecs.getreader(self.encoding)(stream) 303 self.assertEqual(reader.readline(), s1) 304 self.assertEqual(reader.readline(), s2) 305 self.assertEqual(reader.readline(), s3) 306 self.assertEqual(reader.readline(), s4) 307 self.assertEqual(reader.readline(), s5) 308 self.assertEqual(reader.readline(), u"") 309 310 class UTF32Test(ReadTest): 311 encoding = "utf-32" 312 313 spamle = ('\xff\xfe\x00\x00' 314 's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00' 315 's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00') 316 spambe = ('\x00\x00\xfe\xff' 317 '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m' 318 '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m') 319 320 def test_only_one_bom(self): 321 _,_,reader,writer = codecs.lookup(self.encoding) 322 # encode some stream 323 s = StringIO.StringIO() 324 f = writer(s) 325 f.write(u"spam") 326 f.write(u"spam") 327 d = s.getvalue() 328 # check whether there is exactly one BOM in it 329 self.assertTrue(d == self.spamle or d == self.spambe) 330 # try to read it back 331 s = StringIO.StringIO(d) 332 f = reader(s) 333 self.assertEqual(f.read(), u"spamspam") 334 335 def test_badbom(self): 336 s = StringIO.StringIO(4*"\xff") 337 f = codecs.getreader(self.encoding)(s) 338 self.assertRaises(UnicodeError, f.read) 339 340 s = StringIO.StringIO(8*"\xff") 341 f = codecs.getreader(self.encoding)(s) 342 self.assertRaises(UnicodeError, f.read) 343 344 def test_partial(self): 345 self.check_partial( 346 u"\x00\xff\u0100\uffff\U00010000", 347 [ 348 u"", # first byte of BOM read 349 u"", # second byte of BOM read 350 u"", # third byte of BOM read 351 u"", # fourth byte of BOM read => byteorder known 352 u"", 353 u"", 354 u"", 355 u"\x00", 356 u"\x00", 357 u"\x00", 358 u"\x00", 359 u"\x00\xff", 360 u"\x00\xff", 361 u"\x00\xff", 362 u"\x00\xff", 363 u"\x00\xff\u0100", 364 u"\x00\xff\u0100", 365 u"\x00\xff\u0100", 366 u"\x00\xff\u0100", 367 u"\x00\xff\u0100\uffff", 368 u"\x00\xff\u0100\uffff", 369 u"\x00\xff\u0100\uffff", 370 u"\x00\xff\u0100\uffff", 371 u"\x00\xff\u0100\uffff\U00010000", 372 ] 373 ) 374 375 def test_handlers(self): 376 self.assertEqual((u'\ufffd', 1), 377 codecs.utf_32_decode('\x01', 'replace', True)) 378 self.assertEqual((u'', 1), 379 codecs.utf_32_decode('\x01', 'ignore', True)) 380 381 def test_errors(self): 382 self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode, 383 "\xff", "strict", True) 384 385 def test_issue8941(self): 386 # Issue #8941: insufficient result allocation when decoding into 387 # surrogate pairs on UCS-2 builds. 388 encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024 389 self.assertEqual(u'\U00010000' * 1024, 390 codecs.utf_32_decode(encoded_le)[0]) 391 encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024 392 self.assertEqual(u'\U00010000' * 1024, 393 codecs.utf_32_decode(encoded_be)[0]) 394 395 class UTF32LETest(ReadTest): 396 encoding = "utf-32-le" 397 398 def test_partial(self): 399 self.check_partial( 400 u"\x00\xff\u0100\uffff\U00010000", 401 [ 402 u"", 403 u"", 404 u"", 405 u"\x00", 406 u"\x00", 407 u"\x00", 408 u"\x00", 409 u"\x00\xff", 410 u"\x00\xff", 411 u"\x00\xff", 412 u"\x00\xff", 413 u"\x00\xff\u0100", 414 u"\x00\xff\u0100", 415 u"\x00\xff\u0100", 416 u"\x00\xff\u0100", 417 u"\x00\xff\u0100\uffff", 418 u"\x00\xff\u0100\uffff", 419 u"\x00\xff\u0100\uffff", 420 u"\x00\xff\u0100\uffff", 421 u"\x00\xff\u0100\uffff\U00010000", 422 ] 423 ) 424 425 def test_simple(self): 426 self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00") 427 428 def test_errors(self): 429 self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode, 430 "\xff", "strict", True) 431 432 def test_issue8941(self): 433 # Issue #8941: insufficient result allocation when decoding into 434 # surrogate pairs on UCS-2 builds. 435 encoded = '\x00\x00\x01\x00' * 1024 436 self.assertEqual(u'\U00010000' * 1024, 437 codecs.utf_32_le_decode(encoded)[0]) 438 439 class UTF32BETest(ReadTest): 440 encoding = "utf-32-be" 441 442 def test_partial(self): 443 self.check_partial( 444 u"\x00\xff\u0100\uffff\U00010000", 445 [ 446 u"", 447 u"", 448 u"", 449 u"\x00", 450 u"\x00", 451 u"\x00", 452 u"\x00", 453 u"\x00\xff", 454 u"\x00\xff", 455 u"\x00\xff", 456 u"\x00\xff", 457 u"\x00\xff\u0100", 458 u"\x00\xff\u0100", 459 u"\x00\xff\u0100", 460 u"\x00\xff\u0100", 461 u"\x00\xff\u0100\uffff", 462 u"\x00\xff\u0100\uffff", 463 u"\x00\xff\u0100\uffff", 464 u"\x00\xff\u0100\uffff", 465 u"\x00\xff\u0100\uffff\U00010000", 466 ] 467 ) 468 469 def test_simple(self): 470 self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03") 471 472 def test_errors(self): 473 self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode, 474 "\xff", "strict", True) 475 476 def test_issue8941(self): 477 # Issue #8941: insufficient result allocation when decoding into 478 # surrogate pairs on UCS-2 builds. 479 encoded = '\x00\x01\x00\x00' * 1024 480 self.assertEqual(u'\U00010000' * 1024, 481 codecs.utf_32_be_decode(encoded)[0]) 482 483 484 class UTF16Test(ReadTest): 485 encoding = "utf-16" 486 487 spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00' 488 spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m' 489 490 def test_only_one_bom(self): 491 _,_,reader,writer = codecs.lookup(self.encoding) 492 # encode some stream 493 s = StringIO.StringIO() 494 f = writer(s) 495 f.write(u"spam") 496 f.write(u"spam") 497 d = s.getvalue() 498 # check whether there is exactly one BOM in it 499 self.assertTrue(d == self.spamle or d == self.spambe) 500 # try to read it back 501 s = StringIO.StringIO(d) 502 f = reader(s) 503 self.assertEqual(f.read(), u"spamspam") 504 505 def test_badbom(self): 506 s = StringIO.StringIO("\xff\xff") 507 f = codecs.getreader(self.encoding)(s) 508 self.assertRaises(UnicodeError, f.read) 509 510 s = StringIO.StringIO("\xff\xff\xff\xff") 511 f = codecs.getreader(self.encoding)(s) 512 self.assertRaises(UnicodeError, f.read) 513 514 def test_partial(self): 515 self.check_partial( 516 u"\x00\xff\u0100\uffff\U00010000", 517 [ 518 u"", # first byte of BOM read 519 u"", # second byte of BOM read => byteorder known 520 u"", 521 u"\x00", 522 u"\x00", 523 u"\x00\xff", 524 u"\x00\xff", 525 u"\x00\xff\u0100", 526 u"\x00\xff\u0100", 527 u"\x00\xff\u0100\uffff", 528 u"\x00\xff\u0100\uffff", 529 u"\x00\xff\u0100\uffff", 530 u"\x00\xff\u0100\uffff", 531 u"\x00\xff\u0100\uffff\U00010000", 532 ] 533 ) 534 535 def test_handlers(self): 536 self.assertEqual((u'\ufffd', 1), 537 codecs.utf_16_decode('\x01', 'replace', True)) 538 self.assertEqual((u'', 1), 539 codecs.utf_16_decode('\x01', 'ignore', True)) 540 541 def test_errors(self): 542 self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True) 543 544 def test_bug691291(self): 545 # Files are always opened in binary mode, even if no binary mode was 546 # specified. This means that no automatic conversion of '\n' is done 547 # on reading and writing. 548 s1 = u'Hello\r\nworld\r\n' 549 550 s = s1.encode(self.encoding) 551 self.addCleanup(test_support.unlink, test_support.TESTFN) 552 with open(test_support.TESTFN, 'wb') as fp: 553 fp.write(s) 554 with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader: 555 self.assertEqual(reader.read(), s1) 556 557 class UTF16LETest(ReadTest): 558 encoding = "utf-16-le" 559 560 def test_partial(self): 561 self.check_partial( 562 u"\x00\xff\u0100\uffff\U00010000", 563 [ 564 u"", 565 u"\x00", 566 u"\x00", 567 u"\x00\xff", 568 u"\x00\xff", 569 u"\x00\xff\u0100", 570 u"\x00\xff\u0100", 571 u"\x00\xff\u0100\uffff", 572 u"\x00\xff\u0100\uffff", 573 u"\x00\xff\u0100\uffff", 574 u"\x00\xff\u0100\uffff", 575 u"\x00\xff\u0100\uffff\U00010000", 576 ] 577 ) 578 579 def test_errors(self): 580 tests = [ 581 (b'\xff', u'\ufffd'), 582 (b'A\x00Z', u'A\ufffd'), 583 (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'), 584 (b'\x00\xd8', u'\ufffd'), 585 (b'\x00\xd8A', u'\ufffd'), 586 (b'\x00\xd8A\x00', u'\ufffdA'), 587 (b'\x00\xdcA\x00', u'\ufffdA'), 588 ] 589 for raw, expected in tests: 590 try: 591 with self.assertRaises(UnicodeDecodeError): 592 codecs.utf_16_le_decode(raw, 'strict', True) 593 self.assertEqual(raw.decode('utf-16le', 'replace'), expected) 594 except: 595 print 'raw=%r' % raw 596 raise 597 598 class UTF16BETest(ReadTest): 599 encoding = "utf-16-be" 600 601 def test_partial(self): 602 self.check_partial( 603 u"\x00\xff\u0100\uffff\U00010000", 604 [ 605 u"", 606 u"\x00", 607 u"\x00", 608 u"\x00\xff", 609 u"\x00\xff", 610 u"\x00\xff\u0100", 611 u"\x00\xff\u0100", 612 u"\x00\xff\u0100\uffff", 613 u"\x00\xff\u0100\uffff", 614 u"\x00\xff\u0100\uffff", 615 u"\x00\xff\u0100\uffff", 616 u"\x00\xff\u0100\uffff\U00010000", 617 ] 618 ) 619 620 def test_errors(self): 621 tests = [ 622 (b'\xff', u'\ufffd'), 623 (b'\x00A\xff', u'A\ufffd'), 624 (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'), 625 (b'\xd8\x00', u'\ufffd'), 626 (b'\xd8\x00\xdc', u'\ufffd'), 627 (b'\xd8\x00\x00A', u'\ufffdA'), 628 (b'\xdc\x00\x00A', u'\ufffdA'), 629 ] 630 for raw, expected in tests: 631 try: 632 with self.assertRaises(UnicodeDecodeError): 633 codecs.utf_16_be_decode(raw, 'strict', True) 634 self.assertEqual(raw.decode('utf-16be', 'replace'), expected) 635 except: 636 print 'raw=%r' % raw 637 raise 638 639 class UTF8Test(ReadTest): 640 encoding = "utf-8" 641 642 def test_partial(self): 643 self.check_partial( 644 u"\x00\xff\u07ff\u0800\uffff\U00010000", 645 [ 646 u"\x00", 647 u"\x00", 648 u"\x00\xff", 649 u"\x00\xff", 650 u"\x00\xff\u07ff", 651 u"\x00\xff\u07ff", 652 u"\x00\xff\u07ff", 653 u"\x00\xff\u07ff\u0800", 654 u"\x00\xff\u07ff\u0800", 655 u"\x00\xff\u07ff\u0800", 656 u"\x00\xff\u07ff\u0800\uffff", 657 u"\x00\xff\u07ff\u0800\uffff", 658 u"\x00\xff\u07ff\u0800\uffff", 659 u"\x00\xff\u07ff\u0800\uffff", 660 u"\x00\xff\u07ff\u0800\uffff\U00010000", 661 ] 662 ) 663 664 class UTF7Test(ReadTest): 665 encoding = "utf-7" 666 667 def test_ascii(self): 668 # Set D (directly encoded characters) 669 set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' 670 'abcdefghijklmnopqrstuvwxyz' 671 '0123456789' 672 '\'(),-./:?') 673 self.assertEqual(set_d.encode(self.encoding), set_d) 674 self.assertEqual(set_d.decode(self.encoding), set_d) 675 # Set O (optional direct characters) 676 set_o = ' !"#$%&*;<=>@[]^_`{|}' 677 self.assertEqual(set_o.encode(self.encoding), set_o) 678 self.assertEqual(set_o.decode(self.encoding), set_o) 679 # + 680 self.assertEqual(u'a+b'.encode(self.encoding), 'a+-b') 681 self.assertEqual('a+-b'.decode(self.encoding), u'a+b') 682 # White spaces 683 ws = ' \t\n\r' 684 self.assertEqual(ws.encode(self.encoding), ws) 685 self.assertEqual(ws.decode(self.encoding), ws) 686 # Other ASCII characters 687 other_ascii = ''.join(sorted(set(chr(i) for i in range(0x80)) - 688 set(set_d + set_o + '+' + ws))) 689 self.assertEqual(other_ascii.encode(self.encoding), 690 '+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU' 691 'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-') 692 693 def test_partial(self): 694 self.check_partial( 695 u"a+-b", 696 [ 697 u"a", 698 u"a", 699 u"a+", 700 u"a+-", 701 u"a+-b", 702 ] 703 ) 704 705 def test_errors(self): 706 tests = [ 707 ('\xe1b', u'\ufffdb'), 708 ('a\xe1b', u'a\ufffdb'), 709 ('a\xe1\xe1b', u'a\ufffd\ufffdb'), 710 ('a+IK', u'a\ufffd'), 711 ('a+IK-b', u'a\ufffdb'), 712 ('a+IK,b', u'a\ufffdb'), 713 ('a+IKx', u'a\u20ac\ufffd'), 714 ('a+IKx-b', u'a\u20ac\ufffdb'), 715 ('a+IKwgr', u'a\u20ac\ufffd'), 716 ('a+IKwgr-b', u'a\u20ac\ufffdb'), 717 ('a+IKwgr,', u'a\u20ac\ufffd'), 718 ('a+IKwgr,-b', u'a\u20ac\ufffd-b'), 719 ('a+IKwgrB', u'a\u20ac\u20ac\ufffd'), 720 ('a+IKwgrB-b', u'a\u20ac\u20ac\ufffdb'), 721 ('a+/,+IKw-b', u'a\ufffd\u20acb'), 722 ('a+//,+IKw-b', u'a\ufffd\u20acb'), 723 ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'), 724 ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'), 725 ('a+IKw-b\xe1', u'a\u20acb\ufffd'), 726 ('a+IKw\xe1b', u'a\u20ac\ufffdb'), 727 ] 728 for raw, expected in tests: 729 try: 730 with self.assertRaises(UnicodeDecodeError): 731 codecs.utf_7_decode(raw, 'strict', True) 732 self.assertEqual(raw.decode('utf-7', 'replace'), expected) 733 except: 734 print 'raw=%r' % raw 735 raise 736 737 def test_nonbmp(self): 738 self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-') 739 self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-') 740 self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0') 741 self.assertEqual('+2AHcoA'.decode(self.encoding), u'\U000104A0') 742 self.assertEqual(u'\u20ac\U000104A0'.encode(self.encoding), '+IKzYAdyg-') 743 self.assertEqual('+IKzYAdyg-'.decode(self.encoding), u'\u20ac\U000104A0') 744 self.assertEqual('+IKzYAdyg'.decode(self.encoding), u'\u20ac\U000104A0') 745 self.assertEqual(u'\u20ac\u20ac\U000104A0'.encode(self.encoding), 746 '+IKwgrNgB3KA-') 747 self.assertEqual('+IKwgrNgB3KA-'.decode(self.encoding), 748 u'\u20ac\u20ac\U000104A0') 749 self.assertEqual('+IKwgrNgB3KA'.decode(self.encoding), 750 u'\u20ac\u20ac\U000104A0') 751 752 def test_lone_surrogates(self): 753 tests = [ 754 ('a+2AE-b', u'a\ud801b'), 755 ('a+2AE\xe1b', u'a\ufffdb'), 756 ('a+2AE', u'a\ufffd'), 757 ('a+2AEA-b', u'a\ufffdb'), 758 ('a+2AH-b', u'a\ufffdb'), 759 ('a+IKzYAQ-b', u'a\u20ac\ud801b'), 760 ('a+IKzYAQ\xe1b', u'a\u20ac\ufffdb'), 761 ('a+IKzYAQA-b', u'a\u20ac\ufffdb'), 762 ('a+IKzYAd-b', u'a\u20ac\ufffdb'), 763 ('a+IKwgrNgB-b', u'a\u20ac\u20ac\ud801b'), 764 ('a+IKwgrNgB\xe1b', u'a\u20ac\u20ac\ufffdb'), 765 ('a+IKwgrNgB', u'a\u20ac\u20ac\ufffd'), 766 ('a+IKwgrNgBA-b', u'a\u20ac\u20ac\ufffdb'), 767 ] 768 for raw, expected in tests: 769 try: 770 self.assertEqual(raw.decode('utf-7', 'replace'), expected) 771 except: 772 print 'raw=%r' % raw 773 raise 774 775 class UTF16ExTest(unittest.TestCase): 776 777 def test_errors(self): 778 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True) 779 780 def test_bad_args(self): 781 self.assertRaises(TypeError, codecs.utf_16_ex_decode) 782 783 class ReadBufferTest(unittest.TestCase): 784 785 def test_array(self): 786 import array 787 self.assertEqual( 788 codecs.readbuffer_encode(array.array("c", "spam")), 789 ("spam", 4) 790 ) 791 792 def test_empty(self): 793 self.assertEqual(codecs.readbuffer_encode(""), ("", 0)) 794 795 def test_bad_args(self): 796 self.assertRaises(TypeError, codecs.readbuffer_encode) 797 self.assertRaises(TypeError, codecs.readbuffer_encode, 42) 798 799 class CharBufferTest(unittest.TestCase): 800 801 def test_string(self): 802 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4)) 803 804 def test_empty(self): 805 self.assertEqual(codecs.charbuffer_encode(""), ("", 0)) 806 807 def test_bad_args(self): 808 self.assertRaises(TypeError, codecs.charbuffer_encode) 809 self.assertRaises(TypeError, codecs.charbuffer_encode, 42) 810 811 class UTF8SigTest(ReadTest): 812 encoding = "utf-8-sig" 813 814 def test_partial(self): 815 self.check_partial( 816 u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000", 817 [ 818 u"", 819 u"", 820 u"", # First BOM has been read and skipped 821 u"", 822 u"", 823 u"\ufeff", # Second BOM has been read and emitted 824 u"\ufeff\x00", # "\x00" read and emitted 825 u"\ufeff\x00", # First byte of encoded u"\xff" read 826 u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read 827 u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read 828 u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read 829 u"\ufeff\x00\xff\u07ff", 830 u"\ufeff\x00\xff\u07ff", 831 u"\ufeff\x00\xff\u07ff\u0800", 832 u"\ufeff\x00\xff\u07ff\u0800", 833 u"\ufeff\x00\xff\u07ff\u0800", 834 u"\ufeff\x00\xff\u07ff\u0800\uffff", 835 u"\ufeff\x00\xff\u07ff\u0800\uffff", 836 u"\ufeff\x00\xff\u07ff\u0800\uffff", 837 u"\ufeff\x00\xff\u07ff\u0800\uffff", 838 u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000", 839 ] 840 ) 841 842 def test_bug1601501(self): 843 # SF bug #1601501: check that the codec works with a buffer 844 unicode("\xef\xbb\xbf", "utf-8-sig") 845 846 def test_bom(self): 847 d = codecs.getincrementaldecoder("utf-8-sig")() 848 s = u"spam" 849 self.assertEqual(d.decode(s.encode("utf-8-sig")), s) 850 851 def test_stream_bom(self): 852 unistring = u"ABC\u00A1\u2200XYZ" 853 bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ" 854 855 reader = codecs.getreader("utf-8-sig") 856 for sizehint in [None] + range(1, 11) + \ 857 [64, 128, 256, 512, 1024]: 858 istream = reader(StringIO.StringIO(bytestring)) 859 ostream = StringIO.StringIO() 860 while 1: 861 if sizehint is not None: 862 data = istream.read(sizehint) 863 else: 864 data = istream.read() 865 866 if not data: 867 break 868 ostream.write(data) 869 870 got = ostream.getvalue() 871 self.assertEqual(got, unistring) 872 873 def test_stream_bare(self): 874 unistring = u"ABC\u00A1\u2200XYZ" 875 bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ" 876 877 reader = codecs.getreader("utf-8-sig") 878 for sizehint in [None] + range(1, 11) + \ 879 [64, 128, 256, 512, 1024]: 880 istream = reader(StringIO.StringIO(bytestring)) 881 ostream = StringIO.StringIO() 882 while 1: 883 if sizehint is not None: 884 data = istream.read(sizehint) 885 else: 886 data = istream.read() 887 888 if not data: 889 break 890 ostream.write(data) 891 892 got = ostream.getvalue() 893 self.assertEqual(got, unistring) 894 895 class EscapeDecodeTest(unittest.TestCase): 896 def test_empty(self): 897 self.assertEqual(codecs.escape_decode(""), ("", 0)) 898 899 def test_raw(self): 900 decode = codecs.escape_decode 901 for b in range(256): 902 b = chr(b) 903 if b != '\\': 904 self.assertEqual(decode(b + '0'), (b + '0', 2)) 905 906 def test_escape(self): 907 decode = codecs.escape_decode 908 check = coding_checker(self, decode) 909 check(b"[\\\n]", b"[]") 910 check(br'[\"]', b'["]') 911 check(br"[\']", b"[']") 912 check(br"[\\]", br"[\]") 913 check(br"[\a]", b"[\x07]") 914 check(br"[\b]", b"[\x08]") 915 check(br"[\t]", b"[\x09]") 916 check(br"[\n]", b"[\x0a]") 917 check(br"[\v]", b"[\x0b]") 918 check(br"[\f]", b"[\x0c]") 919 check(br"[\r]", b"[\x0d]") 920 check(br"[\7]", b"[\x07]") 921 check(br"[\8]", br"[\8]") 922 check(br"[\78]", b"[\x078]") 923 check(br"[\41]", b"[!]") 924 check(br"[\418]", b"[!8]") 925 check(br"[\101]", b"[A]") 926 check(br"[\1010]", b"[A0]") 927 check(br"[\501]", b"[A]") 928 check(br"[\x41]", b"[A]") 929 check(br"[\X41]", br"[\X41]") 930 check(br"[\x410]", b"[A0]") 931 for b in range(256): 932 b = chr(b) 933 if b not in '\n"\'\\abtnvfr01234567x': 934 check('\\' + b, '\\' + b) 935 936 def test_errors(self): 937 decode = codecs.escape_decode 938 self.assertRaises(ValueError, decode, br"\x") 939 self.assertRaises(ValueError, decode, br"[\x]") 940 self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6)) 941 self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6)) 942 self.assertRaises(ValueError, decode, br"\x0") 943 self.assertRaises(ValueError, decode, br"[\x0]") 944 self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8)) 945 self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8)) 946 947 class RecodingTest(unittest.TestCase): 948 def test_recoding(self): 949 f = StringIO.StringIO() 950 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8") 951 f2.write(u"a") 952 f2.close() 953 # Python used to crash on this at exit because of a refcount 954 # bug in _codecsmodule.c 955 956 # From RFC 3492 957 punycode_testcases = [ 958 # A Arabic (Egyptian): 959 (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644" 960 u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F", 961 "egbpdaj6bu4bxfgehfvwxn"), 962 # B Chinese (simplified): 963 (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587", 964 "ihqwcrb4cv8a8dqg056pqjye"), 965 # C Chinese (traditional): 966 (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587", 967 "ihqwctvzc91f659drss3x8bo0yb"), 968 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky 969 (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074" 970 u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D" 971 u"\u0065\u0073\u006B\u0079", 972 "Proprostnemluvesky-uyb24dma41a"), 973 # E Hebrew: 974 (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8" 975 u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2" 976 u"\u05D1\u05E8\u05D9\u05EA", 977 "4dbcagdahymbxekheh6e0a7fei0b"), 978 # F Hindi (Devanagari): 979 (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D" 980 u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939" 981 u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947" 982 u"\u0939\u0948\u0902", 983 "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"), 984 985 #(G) Japanese (kanji and hiragana): 986 (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092" 987 u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B", 988 "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"), 989 990 # (H) Korean (Hangul syllables): 991 (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774" 992 u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74" 993 u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C", 994 "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j" 995 "psd879ccm6fea98c"), 996 997 # (I) Russian (Cyrillic): 998 (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E" 999 u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440" 1000 u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A" 1001 u"\u0438", 1002 "b1abfaaepdrnnbgefbaDotcwatmq2g4l"), 1003 1004 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol 1005 (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070" 1006 u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070" 1007 u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061" 1008 u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070" 1009 u"\u0061\u00F1\u006F\u006C", 1010 "PorqunopuedensimplementehablarenEspaol-fmd56a"), 1011 1012 # (K) Vietnamese: 1013 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\ 1014 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t 1015 (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B" 1016 u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068" 1017 u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067" 1018 u"\u0056\u0069\u1EC7\u0074", 1019 "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"), 1020 1021 #(L) 3<nen>B<gumi><kinpachi><sensei> 1022 (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F", 1023 "3B-ww4c5e180e575a65lsy2b"), 1024 1025 # (M) <amuro><namie>-with-SUPER-MONKEYS 1026 (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074" 1027 u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D" 1028 u"\u004F\u004E\u004B\u0045\u0059\u0053", 1029 "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"), 1030 1031 # (N) Hello-Another-Way-<sorezore><no><basho> 1032 (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F" 1033 u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D" 1034 u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240", 1035 "Hello-Another-Way--fc4qua05auwb3674vfr0b"), 1036 1037 # (O) <hitotsu><yane><no><shita>2 1038 (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032", 1039 "2-u9tlzr9756bt3uc0v"), 1040 1041 # (P) Maji<de>Koi<suru>5<byou><mae> 1042 (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059" 1043 u"\u308B\u0035\u79D2\u524D", 1044 "MajiKoi5-783gue6qz075azm5e"), 1045 1046 # (Q) <pafii>de<runba> 1047 (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0", 1048 "de-jg4avhby1noc0d"), 1049 1050 # (R) <sono><supiido><de> 1051 (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067", 1052 "d9juau41awczczp"), 1053 1054 # (S) -> $1.00 <- 1055 (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020" 1056 u"\u003C\u002D", 1057 "-> $1.00 <--") 1058 ] 1059 1060 for i in punycode_testcases: 1061 if len(i)!=2: 1062 print repr(i) 1063 1064 class PunycodeTest(unittest.TestCase): 1065 def test_encode(self): 1066 for uni, puny in punycode_testcases: 1067 # Need to convert both strings to lower case, since 1068 # some of the extended encodings use upper case, but our 1069 # code produces only lower case. Converting just puny to 1070 # lower is also insufficient, since some of the input characters 1071 # are upper case. 1072 self.assertEqual(uni.encode("punycode").lower(), puny.lower()) 1073 1074 def test_decode(self): 1075 for uni, puny in punycode_testcases: 1076 self.assertEqual(uni, puny.decode("punycode")) 1077 1078 class UnicodeInternalTest(unittest.TestCase): 1079 def test_bug1251300(self): 1080 # Decoding with unicode_internal used to not correctly handle "code 1081 # points" above 0x10ffff on UCS-4 builds. 1082 if sys.maxunicode > 0xffff: 1083 ok = [ 1084 ("\x00\x10\xff\xff", u"\U0010ffff"), 1085 ("\x00\x00\x01\x01", u"\U00000101"), 1086 ("", u""), 1087 ] 1088 not_ok = [ 1089 "\x7f\xff\xff\xff", 1090 "\x80\x00\x00\x00", 1091 "\x81\x00\x00\x00", 1092 "\x00", 1093 "\x00\x00\x00\x00\x00", 1094 ] 1095 for internal, uni in ok: 1096 if sys.byteorder == "little": 1097 internal = "".join(reversed(internal)) 1098 self.assertEqual(uni, internal.decode("unicode_internal")) 1099 for internal in not_ok: 1100 if sys.byteorder == "little": 1101 internal = "".join(reversed(internal)) 1102 self.assertRaises(UnicodeDecodeError, internal.decode, 1103 "unicode_internal") 1104 1105 def test_decode_error_attributes(self): 1106 if sys.maxunicode > 0xffff: 1107 try: 1108 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal") 1109 except UnicodeDecodeError, ex: 1110 self.assertEqual("unicode_internal", ex.encoding) 1111 self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object) 1112 self.assertEqual(4, ex.start) 1113 self.assertEqual(8, ex.end) 1114 else: 1115 self.fail() 1116 1117 def test_decode_callback(self): 1118 if sys.maxunicode > 0xffff: 1119 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors) 1120 decoder = codecs.getdecoder("unicode_internal") 1121 ab = u"ab".encode("unicode_internal") 1122 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]), 1123 "UnicodeInternalTest") 1124 self.assertEqual((u"ab", 12), ignored) 1125 1126 def test_encode_length(self): 1127 # Issue 3739 1128 encoder = codecs.getencoder("unicode_internal") 1129 self.assertEqual(encoder(u"a")[1], 1) 1130 self.assertEqual(encoder(u"\xe9\u0142")[1], 2) 1131 1132 encoder = codecs.getencoder("string-escape") 1133 self.assertEqual(encoder(r'\x00')[1], 4) 1134 1135 # From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html 1136 nameprep_tests = [ 1137 # 3.1 Map to nothing. 1138 ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar' 1139 '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef' 1140 '\xb8\x8f\xef\xbb\xbf', 1141 'foobarbaz'), 1142 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045. 1143 ('CAFE', 1144 'cafe'), 1145 # 3.3 Case folding 8bit U+00DF (german sharp s). 1146 # The original test case is bogus; it says \xc3\xdf 1147 ('\xc3\x9f', 1148 'ss'), 1149 # 3.4 Case folding U+0130 (turkish capital I with dot). 1150 ('\xc4\xb0', 1151 'i\xcc\x87'), 1152 # 3.5 Case folding multibyte U+0143 U+037A. 1153 ('\xc5\x83\xcd\xba', 1154 '\xc5\x84 \xce\xb9'), 1155 # 3.6 Case folding U+2121 U+33C6 U+1D7BB. 1156 # XXX: skip this as it fails in UCS-2 mode 1157 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb', 1158 # 'telc\xe2\x88\x95kg\xcf\x83'), 1159 (None, None), 1160 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA. 1161 ('j\xcc\x8c\xc2\xa0\xc2\xaa', 1162 '\xc7\xb0 a'), 1163 # 3.8 Case folding U+1FB7 and normalization. 1164 ('\xe1\xbe\xb7', 1165 '\xe1\xbe\xb6\xce\xb9'), 1166 # 3.9 Self-reverting case folding U+01F0 and normalization. 1167 # The original test case is bogus, it says `\xc7\xf0' 1168 ('\xc7\xb0', 1169 '\xc7\xb0'), 1170 # 3.10 Self-reverting case folding U+0390 and normalization. 1171 ('\xce\x90', 1172 '\xce\x90'), 1173 # 3.11 Self-reverting case folding U+03B0 and normalization. 1174 ('\xce\xb0', 1175 '\xce\xb0'), 1176 # 3.12 Self-reverting case folding U+1E96 and normalization. 1177 ('\xe1\xba\x96', 1178 '\xe1\xba\x96'), 1179 # 3.13 Self-reverting case folding U+1F56 and normalization. 1180 ('\xe1\xbd\x96', 1181 '\xe1\xbd\x96'), 1182 # 3.14 ASCII space character U+0020. 1183 (' ', 1184 ' '), 1185 # 3.15 Non-ASCII 8bit space character U+00A0. 1186 ('\xc2\xa0', 1187 ' '), 1188 # 3.16 Non-ASCII multibyte space character U+1680. 1189 ('\xe1\x9a\x80', 1190 None), 1191 # 3.17 Non-ASCII multibyte space character U+2000. 1192 ('\xe2\x80\x80', 1193 ' '), 1194 # 3.18 Zero Width Space U+200b. 1195 ('\xe2\x80\x8b', 1196 ''), 1197 # 3.19 Non-ASCII multibyte space character U+3000. 1198 ('\xe3\x80\x80', 1199 ' '), 1200 # 3.20 ASCII control characters U+0010 U+007F. 1201 ('\x10\x7f', 1202 '\x10\x7f'), 1203 # 3.21 Non-ASCII 8bit control character U+0085. 1204 ('\xc2\x85', 1205 None), 1206 # 3.22 Non-ASCII multibyte control character U+180E. 1207 ('\xe1\xa0\x8e', 1208 None), 1209 # 3.23 Zero Width No-Break Space U+FEFF. 1210 ('\xef\xbb\xbf', 1211 ''), 1212 # 3.24 Non-ASCII control character U+1D175. 1213 ('\xf0\x9d\x85\xb5', 1214 None), 1215 # 3.25 Plane 0 private use character U+F123. 1216 ('\xef\x84\xa3', 1217 None), 1218 # 3.26 Plane 15 private use character U+F1234. 1219 ('\xf3\xb1\x88\xb4', 1220 None), 1221 # 3.27 Plane 16 private use character U+10F234. 1222 ('\xf4\x8f\x88\xb4', 1223 None), 1224 # 3.28 Non-character code point U+8FFFE. 1225 ('\xf2\x8f\xbf\xbe', 1226 None), 1227 # 3.29 Non-character code point U+10FFFF. 1228 ('\xf4\x8f\xbf\xbf', 1229 None), 1230 # 3.30 Surrogate code U+DF42. 1231 ('\xed\xbd\x82', 1232 None), 1233 # 3.31 Non-plain text character U+FFFD. 1234 ('\xef\xbf\xbd', 1235 None), 1236 # 3.32 Ideographic description character U+2FF5. 1237 ('\xe2\xbf\xb5', 1238 None), 1239 # 3.33 Display property character U+0341. 1240 ('\xcd\x81', 1241 '\xcc\x81'), 1242 # 3.34 Left-to-right mark U+200E. 1243 ('\xe2\x80\x8e', 1244 None), 1245 # 3.35 Deprecated U+202A. 1246 ('\xe2\x80\xaa', 1247 None), 1248 # 3.36 Language tagging character U+E0001. 1249 ('\xf3\xa0\x80\x81', 1250 None), 1251 # 3.37 Language tagging character U+E0042. 1252 ('\xf3\xa0\x81\x82', 1253 None), 1254 # 3.38 Bidi: RandALCat character U+05BE and LCat characters. 1255 ('foo\xd6\xbebar', 1256 None), 1257 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters. 1258 ('foo\xef\xb5\x90bar', 1259 None), 1260 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters. 1261 ('foo\xef\xb9\xb6bar', 1262 'foo \xd9\x8ebar'), 1263 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031. 1264 ('\xd8\xa71', 1265 None), 1266 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628. 1267 ('\xd8\xa71\xd8\xa8', 1268 '\xd8\xa71\xd8\xa8'), 1269 # 3.43 Unassigned code point U+E0002. 1270 # Skip this test as we allow unassigned 1271 #('\xf3\xa0\x80\x82', 1272 # None), 1273 (None, None), 1274 # 3.44 Larger test (shrinking). 1275 # Original test case reads \xc3\xdf 1276 ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2' 1277 '\xaa\xce\xb0\xe2\x80\x80', 1278 'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '), 1279 # 3.45 Larger test (expanding). 1280 # Original test case reads \xc3\x9f 1281 ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c' 1282 '\x80', 1283 'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3' 1284 '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82' 1285 '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88') 1286 ] 1287 1288 1289 class NameprepTest(unittest.TestCase): 1290 def test_nameprep(self): 1291 from encodings.idna import nameprep 1292 for pos, (orig, prepped) in enumerate(nameprep_tests): 1293 if orig is None: 1294 # Skipped 1295 continue 1296 # The Unicode strings are given in UTF-8 1297 orig = unicode(orig, "utf-8") 1298 if prepped is None: 1299 # Input contains prohibited characters 1300 self.assertRaises(UnicodeError, nameprep, orig) 1301 else: 1302 prepped = unicode(prepped, "utf-8") 1303 try: 1304 self.assertEqual(nameprep(orig), prepped) 1305 except Exception,e: 1306 raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e))) 1307 1308 class IDNACodecTest(unittest.TestCase): 1309 def test_builtin_decode(self): 1310 self.assertEqual(unicode("python.org", "idna"), u"python.org") 1311 self.assertEqual(unicode("python.org.", "idna"), u"python.org.") 1312 self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org") 1313 self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.") 1314 1315 def test_builtin_encode(self): 1316 self.assertEqual(u"python.org".encode("idna"), "python.org") 1317 self.assertEqual("python.org.".encode("idna"), "python.org.") 1318 self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org") 1319 self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.") 1320 1321 def test_stream(self): 1322 import StringIO 1323 r = codecs.getreader("idna")(StringIO.StringIO("abc")) 1324 r.read(3) 1325 self.assertEqual(r.read(), u"") 1326 1327 def test_incremental_decode(self): 1328 self.assertEqual( 1329 "".join(codecs.iterdecode("python.org", "idna")), 1330 u"python.org" 1331 ) 1332 self.assertEqual( 1333 "".join(codecs.iterdecode("python.org.", "idna")), 1334 u"python.org." 1335 ) 1336 self.assertEqual( 1337 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")), 1338 u"pyth\xf6n.org." 1339 ) 1340 self.assertEqual( 1341 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")), 1342 u"pyth\xf6n.org." 1343 ) 1344 1345 decoder = codecs.getincrementaldecoder("idna")() 1346 self.assertEqual(decoder.decode("xn--xam", ), u"") 1347 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.") 1348 self.assertEqual(decoder.decode(u"rg"), u"") 1349 self.assertEqual(decoder.decode(u"", True), u"org") 1350 1351 decoder.reset() 1352 self.assertEqual(decoder.decode("xn--xam", ), u"") 1353 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.") 1354 self.assertEqual(decoder.decode("rg."), u"org.") 1355 self.assertEqual(decoder.decode("", True), u"") 1356 1357 def test_incremental_encode(self): 1358 self.assertEqual( 1359 "".join(codecs.iterencode(u"python.org", "idna")), 1360 "python.org" 1361 ) 1362 self.assertEqual( 1363 "".join(codecs.iterencode(u"python.org.", "idna")), 1364 "python.org." 1365 ) 1366 self.assertEqual( 1367 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")), 1368 "xn--pythn-mua.org." 1369 ) 1370 self.assertEqual( 1371 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")), 1372 "xn--pythn-mua.org." 1373 ) 1374 1375 encoder = codecs.getincrementalencoder("idna")() 1376 self.assertEqual(encoder.encode(u"\xe4x"), "") 1377 self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.") 1378 self.assertEqual(encoder.encode(u"", True), "org") 1379 1380 encoder.reset() 1381 self.assertEqual(encoder.encode(u"\xe4x"), "") 1382 self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.") 1383 self.assertEqual(encoder.encode(u"", True), "") 1384 1385 class CodecsModuleTest(unittest.TestCase): 1386 1387 def test_decode(self): 1388 self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'), 1389 u'\xe4\xf6\xfc') 1390 self.assertRaises(TypeError, codecs.decode) 1391 self.assertEqual(codecs.decode('abc'), u'abc') 1392 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii') 1393 1394 def test_encode(self): 1395 self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'), 1396 '\xe4\xf6\xfc') 1397 self.assertRaises(TypeError, codecs.encode) 1398 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__") 1399 self.assertEqual(codecs.encode(u'abc'), 'abc') 1400 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii') 1401 1402 def test_register(self): 1403 self.assertRaises(TypeError, codecs.register) 1404 self.assertRaises(TypeError, codecs.register, 42) 1405 1406 def test_lookup(self): 1407 self.assertRaises(TypeError, codecs.lookup) 1408 self.assertRaises(LookupError, codecs.lookup, "__spam__") 1409 self.assertRaises(LookupError, codecs.lookup, " ") 1410 1411 def test_getencoder(self): 1412 self.assertRaises(TypeError, codecs.getencoder) 1413 self.assertRaises(LookupError, codecs.getencoder, "__spam__") 1414 1415 def test_getdecoder(self): 1416 self.assertRaises(TypeError, codecs.getdecoder) 1417 self.assertRaises(LookupError, codecs.getdecoder, "__spam__") 1418 1419 def test_getreader(self): 1420 self.assertRaises(TypeError, codecs.getreader) 1421 self.assertRaises(LookupError, codecs.getreader, "__spam__") 1422 1423 def test_getwriter(self): 1424 self.assertRaises(TypeError, codecs.getwriter) 1425 self.assertRaises(LookupError, codecs.getwriter, "__spam__") 1426 1427 def test_lookup_issue1813(self): 1428 # Issue #1813: under Turkish locales, lookup of some codecs failed 1429 # because 'I' is lowercased as a dotless "i" 1430 oldlocale = locale.getlocale(locale.LC_CTYPE) 1431 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale) 1432 try: 1433 locale.setlocale(locale.LC_CTYPE, 'tr_TR') 1434 except locale.Error: 1435 # Unsupported locale on this system 1436 self.skipTest('test needs Turkish locale') 1437 c = codecs.lookup('ASCII') 1438 self.assertEqual(c.name, 'ascii') 1439 1440 def test_all(self): 1441 api = ( 1442 "encode", "decode", 1443 "register", "CodecInfo", "Codec", "IncrementalEncoder", 1444 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup", 1445 "getencoder", "getdecoder", "getincrementalencoder", 1446 "getincrementaldecoder", "getreader", "getwriter", 1447 "register_error", "lookup_error", 1448 "strict_errors", "replace_errors", "ignore_errors", 1449 "xmlcharrefreplace_errors", "backslashreplace_errors", 1450 "open", "EncodedFile", 1451 "iterencode", "iterdecode", 1452 "BOM", "BOM_BE", "BOM_LE", 1453 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE", 1454 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE", 1455 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented 1456 "StreamReaderWriter", "StreamRecoder", 1457 ) 1458 self.assertEqual(sorted(api), sorted(codecs.__all__)) 1459 for api in codecs.__all__: 1460 getattr(codecs, api) 1461 1462 class StreamReaderTest(unittest.TestCase): 1463 1464 def setUp(self): 1465 self.reader = codecs.getreader('utf-8') 1466 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80') 1467 1468 def test_readlines(self): 1469 f = self.reader(self.stream) 1470 self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00']) 1471 1472 class EncodedFileTest(unittest.TestCase): 1473 1474 def test_basic(self): 1475 f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80') 1476 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8') 1477 self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae') 1478 1479 f = StringIO.StringIO() 1480 ef = codecs.EncodedFile(f, 'utf-8', 'latin1') 1481 ef.write('\xc3\xbc') 1482 self.assertEqual(f.getvalue(), '\xfc') 1483 1484 class Str2StrTest(unittest.TestCase): 1485 1486 def test_read(self): 1487 sin = codecs.encode("\x80", "base64_codec") 1488 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin)) 1489 sout = reader.read() 1490 self.assertEqual(sout, "\x80") 1491 self.assertIsInstance(sout, str) 1492 1493 def test_readline(self): 1494 sin = codecs.encode("\x80", "base64_codec") 1495 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin)) 1496 sout = reader.readline() 1497 self.assertEqual(sout, "\x80") 1498 self.assertIsInstance(sout, str) 1499 1500 all_unicode_encodings = [ 1501 "ascii", 1502 "base64_codec", 1503 "big5", 1504 "big5hkscs", 1505 "charmap", 1506 "cp037", 1507 "cp1006", 1508 "cp1026", 1509 "cp1140", 1510 "cp1250", 1511 "cp1251", 1512 "cp1252", 1513 "cp1253", 1514 "cp1254", 1515 "cp1255", 1516 "cp1256", 1517 "cp1257", 1518 "cp1258", 1519 "cp424", 1520 "cp437", 1521 "cp500", 1522 "cp720", 1523 "cp737", 1524 "cp775", 1525 "cp850", 1526 "cp852", 1527 "cp855", 1528 "cp856", 1529 "cp857", 1530 "cp858", 1531 "cp860", 1532 "cp861", 1533 "cp862", 1534 "cp863", 1535 "cp864", 1536 "cp865", 1537 "cp866", 1538 "cp869", 1539 "cp874", 1540 "cp875", 1541 "cp932", 1542 "cp949", 1543 "cp950", 1544 "euc_jis_2004", 1545 "euc_jisx0213", 1546 "euc_jp", 1547 "euc_kr", 1548 "gb18030", 1549 "gb2312", 1550 "gbk", 1551 "hex_codec", 1552 "hp_roman8", 1553 "hz", 1554 "idna", 1555 "iso2022_jp", 1556 "iso2022_jp_1", 1557 "iso2022_jp_2", 1558 "iso2022_jp_2004", 1559 "iso2022_jp_3", 1560 "iso2022_jp_ext", 1561 "iso2022_kr", 1562 "iso8859_1", 1563 "iso8859_10", 1564 "iso8859_11", 1565 "iso8859_13", 1566 "iso8859_14", 1567 "iso8859_15", 1568 "iso8859_16", 1569 "iso8859_2", 1570 "iso8859_3", 1571 "iso8859_4", 1572 "iso8859_5", 1573 "iso8859_6", 1574 "iso8859_7", 1575 "iso8859_8", 1576 "iso8859_9", 1577 "johab", 1578 "koi8_r", 1579 "koi8_u", 1580 "latin_1", 1581 "mac_cyrillic", 1582 "mac_greek", 1583 "mac_iceland", 1584 "mac_latin2", 1585 "mac_roman", 1586 "mac_turkish", 1587 "palmos", 1588 "ptcp154", 1589 "punycode", 1590 "raw_unicode_escape", 1591 "rot_13", 1592 "shift_jis", 1593 "shift_jis_2004", 1594 "shift_jisx0213", 1595 "tis_620", 1596 "unicode_escape", 1597 "unicode_internal", 1598 "utf_16", 1599 "utf_16_be", 1600 "utf_16_le", 1601 "utf_7", 1602 "utf_8", 1603 ] 1604 1605 if hasattr(codecs, "mbcs_encode"): 1606 all_unicode_encodings.append("mbcs") 1607 1608 # The following encodings work only with str, not unicode 1609 all_string_encodings = [ 1610 "quopri_codec", 1611 "string_escape", 1612 "uu_codec", 1613 ] 1614 1615 # The following encoding is not tested, because it's not supposed 1616 # to work: 1617 # "undefined" 1618 1619 # The following encodings don't work in stateful mode 1620 broken_unicode_with_streams = [ 1621 "base64_codec", 1622 "hex_codec", 1623 "punycode", 1624 "unicode_internal" 1625 ] 1626 broken_incremental_coders = broken_unicode_with_streams[:] 1627 1628 if sys.flags.py3k_warning: 1629 broken_unicode_with_streams.append("rot_13") 1630 1631 # The following encodings only support "strict" mode 1632 only_strict_mode = [ 1633 "idna", 1634 "zlib_codec", 1635 "bz2_codec", 1636 ] 1637 1638 try: 1639 import bz2 1640 except ImportError: 1641 pass 1642 else: 1643 all_unicode_encodings.append("bz2_codec") 1644 broken_unicode_with_streams.append("bz2_codec") 1645 1646 try: 1647 import zlib 1648 except ImportError: 1649 pass 1650 else: 1651 all_unicode_encodings.append("zlib_codec") 1652 broken_unicode_with_streams.append("zlib_codec") 1653 1654 class BasicUnicodeTest(unittest.TestCase): 1655 def test_basics(self): 1656 s = u"abc123" # all codecs should be able to encode these 1657 for encoding in all_unicode_encodings: 1658 name = codecs.lookup(encoding).name 1659 if encoding.endswith("_codec"): 1660 name += "_codec" 1661 elif encoding == "latin_1": 1662 name = "latin_1" 1663 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-")) 1664 (bytes, size) = codecs.getencoder(encoding)(s) 1665 self.assertEqual(size, len(s), "encoding=%r" % encoding) 1666 (chars, size) = codecs.getdecoder(encoding)(bytes) 1667 self.assertEqual(chars, s, "encoding=%r" % encoding) 1668 1669 if encoding not in broken_unicode_with_streams: 1670 # check stream reader/writer 1671 q = Queue() 1672 writer = codecs.getwriter(encoding)(q) 1673 encodedresult = "" 1674 for c in s: 1675 writer.write(c) 1676 encodedresult += q.read() 1677 q = Queue() 1678 reader = codecs.getreader(encoding)(q) 1679 decodedresult = u"" 1680 for c in encodedresult: 1681 q.write(c) 1682 decodedresult += reader.read() 1683 self.assertEqual(decodedresult, s, "encoding=%r" % encoding) 1684 1685 if encoding not in broken_incremental_coders: 1686 # check incremental decoder/encoder and iterencode()/iterdecode() 1687 try: 1688 encoder = codecs.getincrementalencoder(encoding)() 1689 except LookupError: # no IncrementalEncoder 1690 pass 1691 else: 1692 # check incremental decoder/encoder 1693 encodedresult = "" 1694 for c in s: 1695 encodedresult += encoder.encode(c) 1696 encodedresult += encoder.encode(u"", True) 1697 decoder = codecs.getincrementaldecoder(encoding)() 1698 decodedresult = u"" 1699 for c in encodedresult: 1700 decodedresult += decoder.decode(c) 1701 decodedresult += decoder.decode("", True) 1702 self.assertEqual(decodedresult, s, 1703 "encoding=%r" % encoding) 1704 1705 # check iterencode()/iterdecode() 1706 result = u"".join(codecs.iterdecode( 1707 codecs.iterencode(s, encoding), encoding)) 1708 self.assertEqual(result, s, "encoding=%r" % encoding) 1709 1710 # check iterencode()/iterdecode() with empty string 1711 result = u"".join(codecs.iterdecode( 1712 codecs.iterencode(u"", encoding), encoding)) 1713 self.assertEqual(result, u"") 1714 1715 if encoding not in only_strict_mode: 1716 # check incremental decoder/encoder with errors argument 1717 try: 1718 encoder = codecs.getincrementalencoder(encoding)("ignore") 1719 except LookupError: # no IncrementalEncoder 1720 pass 1721 else: 1722 encodedresult = "".join(encoder.encode(c) for c in s) 1723 decoder = codecs.getincrementaldecoder(encoding)("ignore") 1724 decodedresult = u"".join(decoder.decode(c) 1725 for c in encodedresult) 1726 self.assertEqual(decodedresult, s, 1727 "encoding=%r" % encoding) 1728 1729 @test_support.cpython_only 1730 def test_basics_capi(self): 1731 from _testcapi import codec_incrementalencoder, codec_incrementaldecoder 1732 s = u"abc123" # all codecs should be able to encode these 1733 for encoding in all_unicode_encodings: 1734 if encoding not in broken_incremental_coders: 1735 # check incremental decoder/encoder and iterencode()/iterdecode() 1736 try: 1737 cencoder = codec_incrementalencoder(encoding) 1738 except LookupError: # no IncrementalEncoder 1739 pass 1740 else: 1741 # check C API 1742 encodedresult = "" 1743 for c in s: 1744 encodedresult += cencoder.encode(c) 1745 encodedresult += cencoder.encode(u"", True) 1746 cdecoder = codec_incrementaldecoder(encoding) 1747 decodedresult = u"" 1748 for c in encodedresult: 1749 decodedresult += cdecoder.decode(c) 1750 decodedresult += cdecoder.decode("", True) 1751 self.assertEqual(decodedresult, s, 1752 "encoding=%r" % encoding) 1753 1754 if encoding not in only_strict_mode: 1755 # check incremental decoder/encoder with errors argument 1756 try: 1757 cencoder = codec_incrementalencoder(encoding, "ignore") 1758 except LookupError: # no IncrementalEncoder 1759 pass 1760 else: 1761 encodedresult = "".join(cencoder.encode(c) for c in s) 1762 cdecoder = codec_incrementaldecoder(encoding, "ignore") 1763 decodedresult = u"".join(cdecoder.decode(c) 1764 for c in encodedresult) 1765 self.assertEqual(decodedresult, s, 1766 "encoding=%r" % encoding) 1767 1768 def test_seek(self): 1769 # all codecs should be able to encode these 1770 s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456") 1771 for encoding in all_unicode_encodings: 1772 if encoding == "idna": # FIXME: See SF bug #1163178 1773 continue 1774 if encoding in broken_unicode_with_streams: 1775 continue 1776 reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding))) 1777 for t in xrange(5): 1778 # Test that calling seek resets the internal codec state and buffers 1779 reader.seek(0, 0) 1780 line = reader.readline() 1781 self.assertEqual(s[:len(line)], line) 1782 1783 def test_bad_decode_args(self): 1784 for encoding in all_unicode_encodings: 1785 decoder = codecs.getdecoder(encoding) 1786 self.assertRaises(TypeError, decoder) 1787 if encoding not in ("idna", "punycode"): 1788 self.assertRaises(TypeError, decoder, 42) 1789 1790 def test_bad_encode_args(self): 1791 for encoding in all_unicode_encodings: 1792 encoder = codecs.getencoder(encoding) 1793 self.assertRaises(TypeError, encoder) 1794 1795 def test_encoding_map_type_initialized(self): 1796 from encodings import cp1140 1797 # This used to crash, we are only verifying there's no crash. 1798 table_type = type(cp1140.encoding_table) 1799 self.assertEqual(table_type, table_type) 1800 1801 class BasicStrTest(unittest.TestCase): 1802 def test_basics(self): 1803 s = "abc123" 1804 for encoding in all_string_encodings: 1805 (bytes, size) = codecs.getencoder(encoding)(s) 1806 self.assertEqual(size, len(s)) 1807 (chars, size) = codecs.getdecoder(encoding)(bytes) 1808 self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding)) 1809 1810 class CharmapTest(unittest.TestCase): 1811 def test_decode_with_string_map(self): 1812 self.assertEqual( 1813 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"), 1814 (u"abc", 3) 1815 ) 1816 1817 self.assertRaises(UnicodeDecodeError, 1818 codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab" 1819 ) 1820 1821 self.assertRaises(UnicodeDecodeError, 1822 codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe" 1823 ) 1824 1825 self.assertEqual( 1826 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"), 1827 (u"ab\ufffd", 3) 1828 ) 1829 1830 self.assertEqual( 1831 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"), 1832 (u"ab\ufffd", 3) 1833 ) 1834 1835 self.assertEqual( 1836 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"), 1837 (u"ab", 3) 1838 ) 1839 1840 self.assertEqual( 1841 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"), 1842 (u"ab", 3) 1843 ) 1844 1845 allbytes = "".join(chr(i) for i in xrange(256)) 1846 self.assertEqual( 1847 codecs.charmap_decode(allbytes, "ignore", u""), 1848 (u"", len(allbytes)) 1849 ) 1850 1851 def test_decode_with_int2str_map(self): 1852 self.assertEqual( 1853 codecs.charmap_decode("\x00\x01\x02", "strict", 1854 {0: u'a', 1: u'b', 2: u'c'}), 1855 (u"abc", 3) 1856 ) 1857 1858 self.assertEqual( 1859 codecs.charmap_decode("\x00\x01\x02", "strict", 1860 {0: u'Aa', 1: u'Bb', 2: u'Cc'}), 1861 (u"AaBbCc", 3) 1862 ) 1863 1864 self.assertEqual( 1865 codecs.charmap_decode("\x00\x01\x02", "strict", 1866 {0: u'\U0010FFFF', 1: u'b', 2: u'c'}), 1867 (u"\U0010FFFFbc", 3) 1868 ) 1869 1870 self.assertEqual( 1871 codecs.charmap_decode("\x00\x01\x02", "strict", 1872 {0: u'a', 1: u'b', 2: u''}), 1873 (u"ab", 3) 1874 ) 1875 1876 self.assertRaises(UnicodeDecodeError, 1877 codecs.charmap_decode, "\x00\x01\x02", "strict", 1878 {0: u'a', 1: u'b'} 1879 ) 1880 1881 self.assertRaises(UnicodeDecodeError, 1882 codecs.charmap_decode, "\x00\x01\x02", "strict", 1883 {0: u'a', 1: u'b', 2: None} 1884 ) 1885 1886 # Issue #14850 1887 self.assertRaises(UnicodeDecodeError, 1888 codecs.charmap_decode, "\x00\x01\x02", "strict", 1889 {0: u'a', 1: u'b', 2: u'\ufffe'} 1890 ) 1891 1892 self.assertEqual( 1893 codecs.charmap_decode("\x00\x01\x02", "replace", 1894 {0: u'a', 1: u'b'}), 1895 (u"ab\ufffd", 3) 1896 ) 1897 1898 self.assertEqual( 1899 codecs.charmap_decode("\x00\x01\x02", "replace", 1900 {0: u'a', 1: u'b', 2: None}), 1901 (u"ab\ufffd", 3) 1902 ) 1903 1904 # Issue #14850 1905 self.assertEqual( 1906 codecs.charmap_decode("\x00\x01\x02", "replace", 1907 {0: u'a', 1: u'b', 2: u'\ufffe'}), 1908 (u"ab\ufffd", 3) 1909 ) 1910 1911 self.assertEqual( 1912 codecs.charmap_decode("\x00\x01\x02", "ignore", 1913 {0: u'a', 1: u'b'}), 1914 (u"ab", 3) 1915 ) 1916 1917 self.assertEqual( 1918 codecs.charmap_decode("\x00\x01\x02", "ignore", 1919 {0: u'a', 1: u'b', 2: None}), 1920 (u"ab", 3) 1921 ) 1922 1923 # Issue #14850 1924 self.assertEqual( 1925 codecs.charmap_decode("\x00\x01\x02", "ignore", 1926 {0: u'a', 1: u'b', 2: u'\ufffe'}), 1927 (u"ab", 3) 1928 ) 1929 1930 allbytes = "".join(chr(i) for i in xrange(256)) 1931 self.assertEqual( 1932 codecs.charmap_decode(allbytes, "ignore", {}), 1933 (u"", len(allbytes)) 1934 ) 1935 1936 def test_decode_with_int2int_map(self): 1937 a = ord(u'a') 1938 b = ord(u'b') 1939 c = ord(u'c') 1940 1941 self.assertEqual( 1942 codecs.charmap_decode("\x00\x01\x02", "strict", 1943 {0: a, 1: b, 2: c}), 1944 (u"abc", 3) 1945 ) 1946 1947 # Issue #15379 1948 self.assertEqual( 1949 codecs.charmap_decode("\x00\x01\x02", "strict", 1950 {0: 0x10FFFF, 1: b, 2: c}), 1951 (u"\U0010FFFFbc", 3) 1952 ) 1953 1954 self.assertRaises(TypeError, 1955 codecs.charmap_decode, "\x00\x01\x02", "strict", 1956 {0: 0x110000, 1: b, 2: c} 1957 ) 1958 1959 self.assertRaises(UnicodeDecodeError, 1960 codecs.charmap_decode, "\x00\x01\x02", "strict", 1961 {0: a, 1: b}, 1962 ) 1963 1964 self.assertRaises(UnicodeDecodeError, 1965 codecs.charmap_decode, "\x00\x01\x02", "strict", 1966 {0: a, 1: b, 2: 0xFFFE}, 1967 ) 1968 1969 self.assertEqual( 1970 codecs.charmap_decode("\x00\x01\x02", "replace", 1971 {0: a, 1: b}), 1972 (u"ab\ufffd", 3) 1973 ) 1974 1975 self.assertEqual( 1976 codecs.charmap_decode("\x00\x01\x02", "replace", 1977 {0: a, 1: b, 2: 0xFFFE}), 1978 (u"ab\ufffd", 3) 1979 ) 1980 1981 self.assertEqual( 1982 codecs.charmap_decode("\x00\x01\x02", "ignore", 1983 {0: a, 1: b}), 1984 (u"ab", 3) 1985 ) 1986 1987 self.assertEqual( 1988 codecs.charmap_decode("\x00\x01\x02", "ignore", 1989 {0: a, 1: b, 2: 0xFFFE}), 1990 (u"ab", 3) 1991 ) 1992 1993 1994 class WithStmtTest(unittest.TestCase): 1995 def test_encodedfile(self): 1996 f = StringIO.StringIO("\xc3\xbc") 1997 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef: 1998 self.assertEqual(ef.read(), "\xfc") 1999 2000 def test_streamreaderwriter(self): 2001 f = StringIO.StringIO("\xc3\xbc") 2002 info = codecs.lookup("utf-8") 2003 with codecs.StreamReaderWriter(f, info.streamreader, 2004 info.streamwriter, 'strict') as srw: 2005 self.assertEqual(srw.read(), u"\xfc") 2006 2007 2008 class UnicodeEscapeTest(unittest.TestCase): 2009 def test_empty(self): 2010 self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0)) 2011 self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0)) 2012 2013 def test_raw_encode(self): 2014 encode = codecs.unicode_escape_encode 2015 for b in range(32, 127): 2016 if b != ord('\\'): 2017 self.assertEqual(encode(unichr(b)), (chr(b), 1)) 2018 2019 def test_raw_decode(self): 2020 decode = codecs.unicode_escape_decode 2021 for b in range(256): 2022 if b != ord('\\'): 2023 self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2)) 2024 2025 def test_escape_encode(self): 2026 encode = codecs.unicode_escape_encode 2027 check = coding_checker(self, encode) 2028 check(u'\t', r'\t') 2029 check(u'\n', r'\n') 2030 check(u'\r', r'\r') 2031 check(u'\\', r'\\') 2032 for b in range(32): 2033 if chr(b) not in '\t\n\r': 2034 check(unichr(b), '\\x%02x' % b) 2035 for b in range(127, 256): 2036 check(unichr(b), '\\x%02x' % b) 2037 check(u'\u20ac', r'\u20ac') 2038 check(u'\U0001d120', r'\U0001d120') 2039 2040 def test_escape_decode(self): 2041 decode = codecs.unicode_escape_decode 2042 check = coding_checker(self, decode) 2043 check("[\\\n]", u"[]") 2044 check(r'[\"]', u'["]') 2045 check(r"[\']", u"[']") 2046 check(r"[\\]", ur"[\]") 2047 check(r"[\a]", u"[\x07]") 2048 check(r"[\b]", u"[\x08]") 2049 check(r"[\t]", u"[\x09]") 2050 check(r"[\n]", u"[\x0a]") 2051 check(r"[\v]", u"[\x0b]") 2052 check(r"[\f]", u"[\x0c]") 2053 check(r"[\r]", u"[\x0d]") 2054 check(r"[\7]", u"[\x07]") 2055 check(r"[\8]", ur"[\8]") 2056 check(r"[\78]", u"[\x078]") 2057 check(r"[\41]", u"[!]") 2058 check(r"[\418]", u"[!8]") 2059 check(r"[\101]", u"[A]") 2060 check(r"[\1010]", u"[A0]") 2061 check(r"[\x41]", u"[A]") 2062 check(r"[\x410]", u"[A0]") 2063 check(r"\u20ac", u"\u20ac") 2064 check(r"\U0001d120", u"\U0001d120") 2065 for b in range(256): 2066 if chr(b) not in '\n"\'\\abtnvfr01234567xuUN': 2067 check('\\' + chr(b), u'\\' + unichr(b)) 2068 2069 def test_decode_errors(self): 2070 decode = codecs.unicode_escape_decode 2071 for c, d in ('x', 2), ('u', 4), ('U', 4): 2072 for i in range(d): 2073 self.assertRaises(UnicodeDecodeError, decode, 2074 "\\" + c + "0"*i) 2075 self.assertRaises(UnicodeDecodeError, decode, 2076 "[\\" + c + "0"*i + "]") 2077 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i 2078 self.assertEqual(decode(data, "ignore"), (u"[]", len(data))) 2079 self.assertEqual(decode(data, "replace"), 2080 (u"[\ufffd]\ufffd", len(data))) 2081 self.assertRaises(UnicodeDecodeError, decode, r"\U00110000") 2082 self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10)) 2083 self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10)) 2084 2085 2086 class RawUnicodeEscapeTest(unittest.TestCase): 2087 def test_empty(self): 2088 self.assertEqual(codecs.raw_unicode_escape_encode(u""), ("", 0)) 2089 self.assertEqual(codecs.raw_unicode_escape_decode(""), (u"", 0)) 2090 2091 def test_raw_encode(self): 2092 encode = codecs.raw_unicode_escape_encode 2093 for b in range(256): 2094 self.assertEqual(encode(unichr(b)), (chr(b), 1)) 2095 2096 def test_raw_decode(self): 2097 decode = codecs.raw_unicode_escape_decode 2098 for b in range(256): 2099 self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2)) 2100 2101 def test_escape_encode(self): 2102 encode = codecs.raw_unicode_escape_encode 2103 check = coding_checker(self, encode) 2104 for b in range(256): 2105 if chr(b) not in 'uU': 2106 check(u'\\' + unichr(b), '\\' + chr(b)) 2107 check(u'\u20ac', r'\u20ac') 2108 check(u'\U0001d120', r'\U0001d120') 2109 2110 def test_escape_decode(self): 2111 decode = codecs.raw_unicode_escape_decode 2112 check = coding_checker(self, decode) 2113 for b in range(256): 2114 if chr(b) not in 'uU': 2115 check('\\' + chr(b), u'\\' + unichr(b)) 2116 check(r"\u20ac", u"\u20ac") 2117 check(r"\U0001d120", u"\U0001d120") 2118 2119 def test_decode_errors(self): 2120 decode = codecs.raw_unicode_escape_decode 2121 for c, d in ('u', 4), ('U', 4): 2122 for i in range(d): 2123 self.assertRaises(UnicodeDecodeError, decode, 2124 "\\" + c + "0"*i) 2125 self.assertRaises(UnicodeDecodeError, decode, 2126 "[\\" + c + "0"*i + "]") 2127 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i 2128 self.assertEqual(decode(data, "ignore"), (u"[]", len(data))) 2129 self.assertEqual(decode(data, "replace"), 2130 (u"[\ufffd]\ufffd", len(data))) 2131 self.assertRaises(UnicodeDecodeError, decode, r"\U00110000") 2132 self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10)) 2133 self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10)) 2134 2135 2136 class BomTest(unittest.TestCase): 2137 def test_seek0(self): 2138 data = u"1234567890" 2139 tests = ("utf-16", 2140 "utf-16-le", 2141 "utf-16-be", 2142 "utf-32", 2143 "utf-32-le", 2144 "utf-32-be") 2145 self.addCleanup(test_support.unlink, test_support.TESTFN) 2146 for encoding in tests: 2147 # Check if the BOM is written only once 2148 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f: 2149 f.write(data) 2150 f.write(data) 2151 f.seek(0) 2152 self.assertEqual(f.read(), data * 2) 2153 f.seek(0) 2154 self.assertEqual(f.read(), data * 2) 2155 2156 # Check that the BOM is written after a seek(0) 2157 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f: 2158 f.write(data[0]) 2159 self.assertNotEqual(f.tell(), 0) 2160 f.seek(0) 2161 f.write(data) 2162 f.seek(0) 2163 self.assertEqual(f.read(), data) 2164 2165 # (StreamWriter) Check that the BOM is written after a seek(0) 2166 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f: 2167 f.writer.write(data[0]) 2168 self.assertNotEqual(f.writer.tell(), 0) 2169 f.writer.seek(0) 2170 f.writer.write(data) 2171 f.seek(0) 2172 self.assertEqual(f.read(), data) 2173 2174 # Check that the BOM is not written after a seek() at a position 2175 # different than the start 2176 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f: 2177 f.write(data) 2178 f.seek(f.tell()) 2179 f.write(data) 2180 f.seek(0) 2181 self.assertEqual(f.read(), data * 2) 2182 2183 # (StreamWriter) Check that the BOM is not written after a seek() 2184 # at a position different than the start 2185 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f: 2186 f.writer.write(data) 2187 f.writer.seek(f.writer.tell()) 2188 f.writer.write(data) 2189 f.seek(0) 2190 self.assertEqual(f.read(), data * 2) 2191 2192 2193 class TransformCodecTest(unittest.TestCase): 2194 2195 def test_quopri_stateless(self): 2196 # Should encode with quotetabs=True 2197 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec") 2198 self.assertEqual(encoded, b"space=20tab=09eol=20\n") 2199 # But should still support unescaped tabs and spaces 2200 unescaped = b"space tab eol\n" 2201 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped) 2202 2203 def test_uu_invalid(self): 2204 # Missing "begin" line 2205 self.assertRaises(ValueError, codecs.decode, "", "uu-codec") 2206 2207 2208 def test_main(): 2209 test_support.run_unittest( 2210 UTF32Test, 2211 UTF32LETest, 2212 UTF32BETest, 2213 UTF16Test, 2214 UTF16LETest, 2215 UTF16BETest, 2216 UTF8Test, 2217 UTF8SigTest, 2218 UTF7Test, 2219 UTF16ExTest, 2220 ReadBufferTest, 2221 CharBufferTest, 2222 EscapeDecodeTest, 2223 RecodingTest, 2224 PunycodeTest, 2225 UnicodeInternalTest, 2226 NameprepTest, 2227 IDNACodecTest, 2228 CodecsModuleTest, 2229 StreamReaderTest, 2230 EncodedFileTest, 2231 Str2StrTest, 2232 BasicUnicodeTest, 2233 BasicStrTest, 2234 CharmapTest, 2235 WithStmtTest, 2236 UnicodeEscapeTest, 2237 RawUnicodeEscapeTest, 2238 BomTest, 2239 TransformCodecTest, 2240 ) 2241 2242 2243 if __name__ == "__main__": 2244 test_main() 2245