from test import test_support
import unittest
import codecs
import locale
import sys, StringIO, _testcapi

def coding_checker(self, coder):
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size < 0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s
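
# A quick sketch of how the Queue above behaves (illustrative only):
#
#     q = Queue()
#     q.write("spam")
#     q.read(2)    # -> "sp"
#     q.read()     # -> "am"; the buffer is now empty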

class ReadTest(unittest.TestCase):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\u3042" + lineend)
            vwo.append((i*200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
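
    # Illustrative sketch of the incremental-decoder behaviour that
    # check_partial() exercises above, using utf-16-le as an example:
    #
    #     d = codecs.getincrementaldecoder("utf-16-le")()
    #     d.decode("\xff")    # -> u""   (half a code unit; byte is buffered)
    #     d.decode("\x00")    # -> u"\xff"
    #     d.decode("", True)  # -> u""   (flush; nothing may stay buffered)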

    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '   \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
u"\x00\xff\u0100\uffff", 313 u"\x00\xff\u0100\uffff", 314 u"\x00\xff\u0100\uffff\U00010000", 315 ] 316 ) 317 318 def test_handlers(self): 319 self.assertEqual((u'\ufffd', 1), 320 codecs.utf_32_decode('\x01', 'replace', True)) 321 self.assertEqual((u'', 1), 322 codecs.utf_32_decode('\x01', 'ignore', True)) 323 324 def test_errors(self): 325 self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode, 326 "\xff", "strict", True) 327 328 def test_issue8941(self): 329 # Issue #8941: insufficient result allocation when decoding into 330 # surrogate pairs on UCS-2 builds. 331 encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024 332 self.assertEqual(u'\U00010000' * 1024, 333 codecs.utf_32_decode(encoded_le)[0]) 334 encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024 335 self.assertEqual(u'\U00010000' * 1024, 336 codecs.utf_32_decode(encoded_be)[0]) 337 338 class UTF32LETest(ReadTest): 339 encoding = "utf-32-le" 340 341 def test_partial(self): 342 self.check_partial( 343 u"\x00\xff\u0100\uffff\U00010000", 344 [ 345 u"", 346 u"", 347 u"", 348 u"\x00", 349 u"\x00", 350 u"\x00", 351 u"\x00", 352 u"\x00\xff", 353 u"\x00\xff", 354 u"\x00\xff", 355 u"\x00\xff", 356 u"\x00\xff\u0100", 357 u"\x00\xff\u0100", 358 u"\x00\xff\u0100", 359 u"\x00\xff\u0100", 360 u"\x00\xff\u0100\uffff", 361 u"\x00\xff\u0100\uffff", 362 u"\x00\xff\u0100\uffff", 363 u"\x00\xff\u0100\uffff", 364 u"\x00\xff\u0100\uffff\U00010000", 365 ] 366 ) 367 368 def test_simple(self): 369 self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00") 370 371 def test_errors(self): 372 self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode, 373 "\xff", "strict", True) 374 375 def test_issue8941(self): 376 # Issue #8941: insufficient result allocation when decoding into 377 # surrogate pairs on UCS-2 builds. 378 encoded = '\x00\x00\x01\x00' * 1024 379 self.assertEqual(u'\U00010000' * 1024, 380 codecs.utf_32_le_decode(encoded)[0]) 381 382 class UTF32BETest(ReadTest): 383 encoding = "utf-32-be" 384 385 def test_partial(self): 386 self.check_partial( 387 u"\x00\xff\u0100\uffff\U00010000", 388 [ 389 u"", 390 u"", 391 u"", 392 u"\x00", 393 u"\x00", 394 u"\x00", 395 u"\x00", 396 u"\x00\xff", 397 u"\x00\xff", 398 u"\x00\xff", 399 u"\x00\xff", 400 u"\x00\xff\u0100", 401 u"\x00\xff\u0100", 402 u"\x00\xff\u0100", 403 u"\x00\xff\u0100", 404 u"\x00\xff\u0100\uffff", 405 u"\x00\xff\u0100\uffff", 406 u"\x00\xff\u0100\uffff", 407 u"\x00\xff\u0100\uffff", 408 u"\x00\xff\u0100\uffff\U00010000", 409 ] 410 ) 411 412 def test_simple(self): 413 self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03") 414 415 def test_errors(self): 416 self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode, 417 "\xff", "strict", True) 418 419 def test_issue8941(self): 420 # Issue #8941: insufficient result allocation when decoding into 421 # surrogate pairs on UCS-2 builds. 

class UTF32LETest(ReadTest):
    encoding = "utf-32-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])

class UTF32BETest(ReadTest):
    encoding = "utf-32-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


class UTF16Test(ReadTest):
    encoding = "utf-16"

    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
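
# Sketch of the "only one BOM" behaviour checked in UTF16Test above; the
# byte values shown assume a little-endian build:
#
#     s = StringIO.StringIO()
#     w = codecs.getwriter("utf-16")(s)
#     w.write(u"a"); w.write(u"b")
#     s.getvalue()   # -> '\xff\xfea\x00b\x00' (BOM written exactly once)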

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        tests = [
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        tests = [
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
check(br"[\t]", b"[\x09]") 755 check(br"[\n]", b"[\x0a]") 756 check(br"[\v]", b"[\x0b]") 757 check(br"[\f]", b"[\x0c]") 758 check(br"[\r]", b"[\x0d]") 759 check(br"[\7]", b"[\x07]") 760 check(br"[\8]", br"[\8]") 761 check(br"[\78]", b"[\x078]") 762 check(br"[\41]", b"[!]") 763 check(br"[\418]", b"[!8]") 764 check(br"[\101]", b"[A]") 765 check(br"[\1010]", b"[A0]") 766 check(br"[\501]", b"[A]") 767 check(br"[\x41]", b"[A]") 768 check(br"[\X41]", br"[\X41]") 769 check(br"[\x410]", b"[A0]") 770 for b in range(256): 771 b = chr(b) 772 if b not in '\n"\'\\abtnvfr01234567x': 773 check('\\' + b, '\\' + b) 774 775 def test_errors(self): 776 decode = codecs.escape_decode 777 self.assertRaises(ValueError, decode, br"\x") 778 self.assertRaises(ValueError, decode, br"[\x]") 779 self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6)) 780 self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6)) 781 self.assertRaises(ValueError, decode, br"\x0") 782 self.assertRaises(ValueError, decode, br"[\x0]") 783 self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8)) 784 self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8)) 785 786 class RecodingTest(unittest.TestCase): 787 def test_recoding(self): 788 f = StringIO.StringIO() 789 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8") 790 f2.write(u"a") 791 f2.close() 792 # Python used to crash on this at exit because of a refcount 793 # bug in _codecsmodule.c 794 795 # From RFC 3492 796 punycode_testcases = [ 797 # A Arabic (Egyptian): 798 (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644" 799 u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F", 800 "egbpdaj6bu4bxfgehfvwxn"), 801 # B Chinese (simplified): 802 (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587", 803 "ihqwcrb4cv8a8dqg056pqjye"), 804 # C Chinese (traditional): 805 (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587", 806 "ihqwctvzc91f659drss3x8bo0yb"), 807 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky 808 (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074" 809 u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D" 810 u"\u0065\u0073\u006B\u0079", 811 "Proprostnemluvesky-uyb24dma41a"), 812 # E Hebrew: 813 (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8" 814 u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2" 815 u"\u05D1\u05E8\u05D9\u05EA", 816 "4dbcagdahymbxekheh6e0a7fei0b"), 817 # F Hindi (Devanagari): 818 (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D" 819 u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939" 820 u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947" 821 u"\u0939\u0948\u0902", 822 "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"), 823 824 #(G) Japanese (kanji and hiragana): 825 (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092" 826 u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B", 827 "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"), 828 829 # (H) Korean (Hangul syllables): 830 (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774" 831 u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74" 832 u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C", 833 "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j" 834 "psd879ccm6fea98c"), 835 836 # (I) Russian (Cyrillic): 837 (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E" 838 u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440" 839 u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A" 840 u"\u0438", 841 "b1abfaaepdrnnbgefbaDotcwatmq2g4l"), 842 843 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol 
844 (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070" 845 u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070" 846 u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061" 847 u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070" 848 u"\u0061\u00F1\u006F\u006C", 849 "PorqunopuedensimplementehablarenEspaol-fmd56a"), 850 851 # (K) Vietnamese: 852 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\ 853 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t 854 (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B" 855 u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068" 856 u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067" 857 u"\u0056\u0069\u1EC7\u0074", 858 "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"), 859 860 #(L) 3<nen>B<gumi><kinpachi><sensei> 861 (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F", 862 "3B-ww4c5e180e575a65lsy2b"), 863 864 # (M) <amuro><namie>-with-SUPER-MONKEYS 865 (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074" 866 u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D" 867 u"\u004F\u004E\u004B\u0045\u0059\u0053", 868 "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"), 869 870 # (N) Hello-Another-Way-<sorezore><no><basho> 871 (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F" 872 u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D" 873 u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240", 874 "Hello-Another-Way--fc4qua05auwb3674vfr0b"), 875 876 # (O) <hitotsu><yane><no><shita>2 877 (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032", 878 "2-u9tlzr9756bt3uc0v"), 879 880 # (P) Maji<de>Koi<suru>5<byou><mae> 881 (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059" 882 u"\u308B\u0035\u79D2\u524D", 883 "MajiKoi5-783gue6qz075azm5e"), 884 885 # (Q) <pafii>de<runba> 886 (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0", 887 "de-jg4avhby1noc0d"), 888 889 # (R) <sono><supiido><de> 890 (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067", 891 "d9juau41awczczp"), 892 893 # (S) -> $1.00 <- 894 (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020" 895 u"\u003C\u002D", 896 "-> $1.00 <--") 897 ] 898 899 for i in punycode_testcases: 900 if len(i)!=2: 901 print repr(i) 902 903 class PunycodeTest(unittest.TestCase): 904 def test_encode(self): 905 for uni, puny in punycode_testcases: 906 # Need to convert both strings to lower case, since 907 # some of the extended encodings use upper case, but our 908 # code produces only lower case. Converting just puny to 909 # lower is also insufficient, since some of the input characters 910 # are upper case. 911 self.assertEqual(uni.encode("punycode").lower(), puny.lower()) 912 913 def test_decode(self): 914 for uni, puny in punycode_testcases: 915 self.assertEqual(uni, puny.decode("punycode")) 916 917 class UnicodeInternalTest(unittest.TestCase): 918 def test_bug1251300(self): 919 # Decoding with unicode_internal used to not correctly handle "code 920 # points" above 0x10ffff on UCS-4 builds. 

class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)
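
# Sketch of the utf-8-sig asymmetry exercised by UTF8SigTest above:
# encoding always prepends one BOM, decoding strips only the first one:
#
#     u"spam".encode("utf-8-sig")             # -> '\xef\xbb\xbfspam'
#     '\xef\xbb\xbfspam'.decode("utf-8-sig")  # -> u'spam'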

class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEqual(codecs.escape_decode(""), ("", 0))

    def test_raw(self):
        decode = codecs.escape_decode
        for b in range(256):
            b = chr(b)
            if b != '\\':
                self.assertEqual(decode(b + '0'), (b + '0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", br"[\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        check(br"[\7]", b"[\x07]")
        check(br"[\8]", br"[\8]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
        check(br"[\X41]", br"[\X41]")
        check(br"[\x410]", b"[A0]")
        for b in range(256):
            b = chr(b)
            if b not in '\n"\'\\abtnvfr01234567x':
                check('\\' + b, '\\' + b)

    def test_errors(self):
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))

class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)
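
# A minimal round-trip sketch of the punycode codec tested below, using
# the familiar "buecher" example (not one of the RFC vectors above):
#
#     u"b\xfccher".encode("punycode")   # -> 'bcher-kva'
#     "bcher-kva".decode("punycode")    # -> u'b\xfccher'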

class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))

class UnicodeInternalTest(unittest.TestCase):
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEqual(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEqual("unicode_internal", ex.encoding)
                self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEqual(4, ex.start)
                self.assertEqual(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEqual((u"ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder(u"a")[1], 1)
        self.assertEqual(encoder(u"\xe9\u0142")[1], 2)

        encoder = codecs.getencoder("string-escape")
        self.assertEqual(encoder(r'\x00')[1], 4)
codecs.getreader("base64_codec")(StringIO.StringIO(sin)) 1306 sout = reader.read() 1307 self.assertEqual(sout, "\x80") 1308 self.assertIsInstance(sout, str) 1309 1310 def test_readline(self): 1311 sin = "\x80".encode("base64_codec") 1312 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin)) 1313 sout = reader.readline() 1314 self.assertEqual(sout, "\x80") 1315 self.assertIsInstance(sout, str) 1316 1317 all_unicode_encodings = [ 1318 "ascii", 1319 "base64_codec", 1320 "big5", 1321 "big5hkscs", 1322 "charmap", 1323 "cp037", 1324 "cp1006", 1325 "cp1026", 1326 "cp1140", 1327 "cp1250", 1328 "cp1251", 1329 "cp1252", 1330 "cp1253", 1331 "cp1254", 1332 "cp1255", 1333 "cp1256", 1334 "cp1257", 1335 "cp1258", 1336 "cp424", 1337 "cp437", 1338 "cp500", 1339 "cp720", 1340 "cp737", 1341 "cp775", 1342 "cp850", 1343 "cp852", 1344 "cp855", 1345 "cp856", 1346 "cp857", 1347 "cp858", 1348 "cp860", 1349 "cp861", 1350 "cp862", 1351 "cp863", 1352 "cp864", 1353 "cp865", 1354 "cp866", 1355 "cp869", 1356 "cp874", 1357 "cp875", 1358 "cp932", 1359 "cp949", 1360 "cp950", 1361 "euc_jis_2004", 1362 "euc_jisx0213", 1363 "euc_jp", 1364 "euc_kr", 1365 "gb18030", 1366 "gb2312", 1367 "gbk", 1368 "hex_codec", 1369 "hp_roman8", 1370 "hz", 1371 "idna", 1372 "iso2022_jp", 1373 "iso2022_jp_1", 1374 "iso2022_jp_2", 1375 "iso2022_jp_2004", 1376 "iso2022_jp_3", 1377 "iso2022_jp_ext", 1378 "iso2022_kr", 1379 "iso8859_1", 1380 "iso8859_10", 1381 "iso8859_11", 1382 "iso8859_13", 1383 "iso8859_14", 1384 "iso8859_15", 1385 "iso8859_16", 1386 "iso8859_2", 1387 "iso8859_3", 1388 "iso8859_4", 1389 "iso8859_5", 1390 "iso8859_6", 1391 "iso8859_7", 1392 "iso8859_8", 1393 "iso8859_9", 1394 "johab", 1395 "koi8_r", 1396 "koi8_u", 1397 "latin_1", 1398 "mac_cyrillic", 1399 "mac_greek", 1400 "mac_iceland", 1401 "mac_latin2", 1402 "mac_roman", 1403 "mac_turkish", 1404 "palmos", 1405 "ptcp154", 1406 "punycode", 1407 "raw_unicode_escape", 1408 "rot_13", 1409 "shift_jis", 1410 "shift_jis_2004", 1411 "shift_jisx0213", 1412 "tis_620", 1413 "unicode_escape", 1414 "unicode_internal", 1415 "utf_16", 1416 "utf_16_be", 1417 "utf_16_le", 1418 "utf_7", 1419 "utf_8", 1420 ] 1421 1422 if hasattr(codecs, "mbcs_encode"): 1423 all_unicode_encodings.append("mbcs") 1424 1425 # The following encodings work only with str, not unicode 1426 all_string_encodings = [ 1427 "quopri_codec", 1428 "string_escape", 1429 "uu_codec", 1430 ] 1431 1432 # The following encoding is not tested, because it's not supposed 1433 # to work: 1434 # "undefined" 1435 1436 # The following encodings don't work in stateful mode 1437 broken_unicode_with_streams = [ 1438 "base64_codec", 1439 "hex_codec", 1440 "punycode", 1441 "unicode_internal" 1442 ] 1443 broken_incremental_coders = broken_unicode_with_streams[:] 1444 1445 # The following encodings only support "strict" mode 1446 only_strict_mode = [ 1447 "idna", 1448 "zlib_codec", 1449 "bz2_codec", 1450 ] 1451 1452 try: 1453 import bz2 1454 except ImportError: 1455 pass 1456 else: 1457 all_unicode_encodings.append("bz2_codec") 1458 broken_unicode_with_streams.append("bz2_codec") 1459 1460 try: 1461 import zlib 1462 except ImportError: 1463 pass 1464 else: 1465 all_unicode_encodings.append("zlib_codec") 1466 broken_unicode_with_streams.append("zlib_codec") 1467 1468 class BasicUnicodeTest(unittest.TestCase): 1469 def test_basics(self): 1470 s = u"abc123" # all codecs should be able to encode these 1471 for encoding in all_unicode_encodings: 1472 name = codecs.lookup(encoding).name 1473 if encoding.endswith("_codec"): 1474 name += "_codec" 


class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception, e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
        self.assertEqual(unicode("python.org", "idna"), u"python.org")
        self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
        self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual(u"python.org".encode("idna"), "python.org")
        self.assertEqual("python.org.".encode("idna"), "python.org.")
        self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEqual(r.read(), u"")

    def test_incremental_decode(self):
        self.assertEqual(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode(u"rg"), u"")
        self.assertEqual(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode("rg."), u"org.")
        self.assertEqual(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEqual(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEqual(encoder.encode(u"", True), "")

class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                         u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as a dotless "i"
        oldlocale = locale.getlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')

class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])

class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')

        f = StringIO.StringIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write('\xc3\xbc')
        self.assertEqual(f.getvalue(), '\xfc')

class Str2StrTest(unittest.TestCase):

    def test_read(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.read()
        self.assertEqual(sout, "\x80")
        self.assertIsInstance(sout, str)

    def test_readline(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.readline()
        self.assertEqual(sout, "\x80")
        self.assertIsInstance(sout, str)

all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#   "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
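
# Illustrative round trip through the iterator helpers BasicUnicodeTest
# relies on below (a sketch, not part of the original test data):
#
#     "".join(codecs.iterencode(u"abc123", "ascii"))     # -> 'abc123'
#     u"".join(codecs.iterdecode(["ab", "c"], "ascii"))  # -> u'abc'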

class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        s = u"abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

class CharmapTest(unittest.TestCase):
    def test_decode_with_string_map(self):
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe"
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'a', 1: u'b', 2: u'c'}),
            (u"abc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
            (u"AaBbCc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
            (u"\U0010FFFFbc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'a', 1: u'b', 2: u''}),
            (u"ab", 3)
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: u'a', 1: u'b'}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: u'a', 1: u'b', 2: None}
        )

        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: u'a', 1: u'b', 2: u'\ufffe'}
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b'}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b', 2: None}),
            (u"ab\ufffd", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b', 2: u'\ufffe'}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b'}),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b', 2: None}),
            (u"ab", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b', 2: u'\ufffe'}),
            (u"ab", 3)
        )

        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            (u"", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        a = ord(u'a')
        b = ord(u'b')
        c = ord(u'c')

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            (u"abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            (u"\U0010FFFFbc", 3)
        )

        self.assertRaises(TypeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: 0x110000, 1: b, 2: c}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: a, 1: b},
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: a, 1: b, 2: 0xFFFE},
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: a, 1: b, 2: 0xFFFE}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: a, 1: b, 2: 0xFFFE}),
            (u"ab", 3)
        )


class WithStmtTest(unittest.TestCase):
    def test_encodedfile(self):
        f = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), "\xfc")

    def test_streamreaderwriter(self):
        f = StringIO.StringIO("\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), u"\xfc")


class UnicodeEscapeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
        self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))

    def test_raw_encode(self):
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != ord('\\'):
                self.assertEqual(encode(unichr(b)), (chr(b), 1))

    def test_raw_decode(self):
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != ord('\\'):
                self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check(u'\t', r'\t')
        check(u'\n', r'\n')
        check(u'\r', r'\r')
        check(u'\\', r'\\')
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(unichr(b), '\\x%02x' % b)
        for b in range(127, 256):
            check(unichr(b), '\\x%02x' % b)
        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check("[\\\n]", u"[]")
        check(r'[\"]', u'["]')
        check(r"[\']", u"[']")
        check(r"[\\]", ur"[\]")
        check(r"[\a]", u"[\x07]")
        check(r"[\b]", u"[\x08]")
        check(r"[\t]", u"[\x09]")
        check(r"[\n]", u"[\x0a]")
        check(r"[\v]", u"[\x0b]")
        check(r"[\f]", u"[\x0c]")
        check(r"[\r]", u"[\x0d]")
        check(r"[\7]", u"[\x07]")
        check(r"[\8]", ur"[\8]")
        check(r"[\78]", u"[\x078]")
        check(r"[\41]", u"[!]")
        check(r"[\418]", u"[!8]")
        check(r"[\101]", u"[A]")
        check(r"[\1010]", u"[A0]")
        check(r"[\x41]", u"[A]")
        check(r"[\x410]", u"[A0]")
        check(r"\u20ac", u"\u20ac")
        check(r"\U0001d120", u"\U0001d120")
        for b in range(256):
            if chr(b) not in '\n"\'\\abtnvfr01234567xuUN':
                check('\\' + chr(b), u'\\' + unichr(b))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        for c, d in ('x', 2), ('u', 4), ('U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  "\\" + c + "0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  "[\\" + c + "0"*i + "]")
                data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
                self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
        self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
        self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))


class RawUnicodeEscapeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(u""), ("", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(""), (u"", 0))

    def test_raw_encode(self):
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(unichr(b)), (chr(b), 1))

    def test_raw_decode(self):
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        for b in range(256):
            if chr(b) not in 'uU':
                check(u'\\' + unichr(b), '\\' + chr(b))
        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        for b in range(256):
            if chr(b) not in 'uU':
                check('\\' + chr(b), u'\\' + unichr(b))
        check(r"\u20ac", u"\u20ac")
        check(r"\U0001d120", u"\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        for c, d in ('u', 4), ('U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  "\\" + c + "0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  "[\\" + c + "0"*i + "]")
                data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
                self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
        self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
        self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))


class BomTest(unittest.TestCase):
    def test_seek0(self):
        data = u"1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)


def test_main():
    test_support.run_unittest(
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
        UnicodeEscapeTest,
        RawUnicodeEscapeTest,
        BomTest,
    )


if __name__ == "__main__":
    test_main()