1 import difflib 2 from test.support import run_unittest, findfile 3 import unittest 4 import doctest 5 import sys 6 7 8 class TestWithAscii(unittest.TestCase): 9 def test_one_insert(self): 10 sm = difflib.SequenceMatcher(None, 'b' * 100, 'a' + 'b' * 100) 11 self.assertAlmostEqual(sm.ratio(), 0.995, places=3) 12 self.assertEqual(list(sm.get_opcodes()), 13 [ ('insert', 0, 0, 0, 1), 14 ('equal', 0, 100, 1, 101)]) 15 self.assertEqual(sm.bpopular, set()) 16 sm = difflib.SequenceMatcher(None, 'b' * 100, 'b' * 50 + 'a' + 'b' * 50) 17 self.assertAlmostEqual(sm.ratio(), 0.995, places=3) 18 self.assertEqual(list(sm.get_opcodes()), 19 [ ('equal', 0, 50, 0, 50), 20 ('insert', 50, 50, 50, 51), 21 ('equal', 50, 100, 51, 101)]) 22 self.assertEqual(sm.bpopular, set()) 23 24 def test_one_delete(self): 25 sm = difflib.SequenceMatcher(None, 'a' * 40 + 'c' + 'b' * 40, 'a' * 40 + 'b' * 40) 26 self.assertAlmostEqual(sm.ratio(), 0.994, places=3) 27 self.assertEqual(list(sm.get_opcodes()), 28 [ ('equal', 0, 40, 0, 40), 29 ('delete', 40, 41, 40, 40), 30 ('equal', 41, 81, 40, 80)]) 31 32 def test_bjunk(self): 33 sm = difflib.SequenceMatcher(isjunk=lambda x: x == ' ', 34 a='a' * 40 + 'b' * 40, b='a' * 44 + 'b' * 40) 35 self.assertEqual(sm.bjunk, set()) 36 37 sm = difflib.SequenceMatcher(isjunk=lambda x: x == ' ', 38 a='a' * 40 + 'b' * 40, b='a' * 44 + 'b' * 40 + ' ' * 20) 39 self.assertEqual(sm.bjunk, {' '}) 40 41 sm = difflib.SequenceMatcher(isjunk=lambda x: x in [' ', 'b'], 42 a='a' * 40 + 'b' * 40, b='a' * 44 + 'b' * 40 + ' ' * 20) 43 self.assertEqual(sm.bjunk, {' ', 'b'}) 44 45 46 class TestAutojunk(unittest.TestCase): 47 """Tests for the autojunk parameter added in 2.7""" 48 def test_one_insert_homogenous_sequence(self): 49 # By default autojunk=True and the heuristic kicks in for a sequence 50 # of length 200+ 51 seq1 = 'b' * 200 52 seq2 = 'a' + 'b' * 200 53 54 sm = difflib.SequenceMatcher(None, seq1, seq2) 55 self.assertAlmostEqual(sm.ratio(), 0, places=3) 56 self.assertEqual(sm.bpopular, {'b'}) 57 58 # Now turn the heuristic off 59 sm = difflib.SequenceMatcher(None, seq1, seq2, autojunk=False) 60 self.assertAlmostEqual(sm.ratio(), 0.9975, places=3) 61 self.assertEqual(sm.bpopular, set()) 62 63 64 class TestSFbugs(unittest.TestCase): 65 def test_ratio_for_null_seqn(self): 66 # Check clearing of SF bug 763023 67 s = difflib.SequenceMatcher(None, [], []) 68 self.assertEqual(s.ratio(), 1) 69 self.assertEqual(s.quick_ratio(), 1) 70 self.assertEqual(s.real_quick_ratio(), 1) 71 72 def test_comparing_empty_lists(self): 73 # Check fix for bug #979794 74 group_gen = difflib.SequenceMatcher(None, [], []).get_grouped_opcodes() 75 self.assertRaises(StopIteration, next, group_gen) 76 diff_gen = difflib.unified_diff([], []) 77 self.assertRaises(StopIteration, next, diff_gen) 78 79 def test_matching_blocks_cache(self): 80 # Issue #21635 81 s = difflib.SequenceMatcher(None, "abxcd", "abcd") 82 first = s.get_matching_blocks() 83 second = s.get_matching_blocks() 84 self.assertEqual(second[0].size, 2) 85 self.assertEqual(second[1].size, 2) 86 self.assertEqual(second[2].size, 0) 87 88 def test_added_tab_hint(self): 89 # Check fix for bug #1488943 90 diff = list(difflib.Differ().compare(["\tI am a buggy"],["\t\tI am a bug"])) 91 self.assertEqual("- \tI am a buggy", diff[0]) 92 self.assertEqual("? --\n", diff[1]) 93 self.assertEqual("+ \t\tI am a bug", diff[2]) 94 self.assertEqual("? +\n", diff[3]) 95 96 def test_mdiff_catch_stop_iteration(self): 97 # Issue #33224 98 self.assertEqual( 99 list(difflib._mdiff(["2"], ["3"], 1)), 100 [((1, '\x00-2\x01'), (1, '\x00+3\x01'), True)], 101 ) 102 103 104 patch914575_from1 = """ 105 1. Beautiful is beTTer than ugly. 106 2. Explicit is better than implicit. 107 3. Simple is better than complex. 108 4. Complex is better than complicated. 109 """ 110 111 patch914575_to1 = """ 112 1. Beautiful is better than ugly. 113 3. Simple is better than complex. 114 4. Complicated is better than complex. 115 5. Flat is better than nested. 116 """ 117 118 patch914575_nonascii_from1 = """ 119 1. Beautiful is beTTer than ugly. 120 2. Explicit is better than mplct. 121 3. Simple is better than complex. 122 4. Complex is better than complicated. 123 """ 124 125 patch914575_nonascii_to1 = """ 126 1. Beautiful is better than gly. 127 3. Smple is better than complex. 128 4. Complicated is better than cmplex. 129 5. Flat is better than nested. 130 """ 131 132 patch914575_from2 = """ 133 \t\tLine 1: preceded by from:[tt] to:[ssss] 134 \t\tLine 2: preceded by from:[sstt] to:[sssst] 135 \t \tLine 3: preceded by from:[sstst] to:[ssssss] 136 Line 4: \thas from:[sst] to:[sss] after : 137 Line 5: has from:[t] to:[ss] at end\t 138 """ 139 140 patch914575_to2 = """ 141 Line 1: preceded by from:[tt] to:[ssss] 142 \tLine 2: preceded by from:[sstt] to:[sssst] 143 Line 3: preceded by from:[sstst] to:[ssssss] 144 Line 4: has from:[sst] to:[sss] after : 145 Line 5: has from:[t] to:[ss] at end 146 """ 147 148 patch914575_from3 = """line 0 149 1234567890123456789012345689012345 150 line 1 151 line 2 152 line 3 153 line 4 changed 154 line 5 changed 155 line 6 changed 156 line 7 157 line 8 subtracted 158 line 9 159 1234567890123456789012345689012345 160 short line 161 just fits in!! 162 just fits in two lines yup!! 163 the end""" 164 165 patch914575_to3 = """line 0 166 1234567890123456789012345689012345 167 line 1 168 line 2 added 169 line 3 170 line 4 chanGEd 171 line 5a chanGed 172 line 6a changEd 173 line 7 174 line 8 175 line 9 176 1234567890 177 another long line that needs to be wrapped 178 just fitS in!! 179 just fits in two lineS yup!! 180 the end""" 181 182 class TestSFpatches(unittest.TestCase): 183 184 def test_html_diff(self): 185 # Check SF patch 914575 for generating HTML differences 186 f1a = ((patch914575_from1 + '123\n'*10)*3) 187 t1a = (patch914575_to1 + '123\n'*10)*3 188 f1b = '456\n'*10 + f1a 189 t1b = '456\n'*10 + t1a 190 f1a = f1a.splitlines() 191 t1a = t1a.splitlines() 192 f1b = f1b.splitlines() 193 t1b = t1b.splitlines() 194 f2 = patch914575_from2.splitlines() 195 t2 = patch914575_to2.splitlines() 196 f3 = patch914575_from3 197 t3 = patch914575_to3 198 i = difflib.HtmlDiff() 199 j = difflib.HtmlDiff(tabsize=2) 200 k = difflib.HtmlDiff(wrapcolumn=14) 201 202 full = i.make_file(f1a,t1a,'from','to',context=False,numlines=5) 203 tables = '\n'.join( 204 [ 205 '<h2>Context (first diff within numlines=5(default))</h2>', 206 i.make_table(f1a,t1a,'from','to',context=True), 207 '<h2>Context (first diff after numlines=5(default))</h2>', 208 i.make_table(f1b,t1b,'from','to',context=True), 209 '<h2>Context (numlines=6)</h2>', 210 i.make_table(f1a,t1a,'from','to',context=True,numlines=6), 211 '<h2>Context (numlines=0)</h2>', 212 i.make_table(f1a,t1a,'from','to',context=True,numlines=0), 213 '<h2>Same Context</h2>', 214 i.make_table(f1a,f1a,'from','to',context=True), 215 '<h2>Same Full</h2>', 216 i.make_table(f1a,f1a,'from','to',context=False), 217 '<h2>Empty Context</h2>', 218 i.make_table([],[],'from','to',context=True), 219 '<h2>Empty Full</h2>', 220 i.make_table([],[],'from','to',context=False), 221 '<h2>tabsize=2</h2>', 222 j.make_table(f2,t2), 223 '<h2>tabsize=default</h2>', 224 i.make_table(f2,t2), 225 '<h2>Context (wrapcolumn=14,numlines=0)</h2>', 226 k.make_table(f3.splitlines(),t3.splitlines(),context=True,numlines=0), 227 '<h2>wrapcolumn=14,splitlines()</h2>', 228 k.make_table(f3.splitlines(),t3.splitlines()), 229 '<h2>wrapcolumn=14,splitlines(True)</h2>', 230 k.make_table(f3.splitlines(True),t3.splitlines(True)), 231 ]) 232 actual = full.replace('</body>','\n%s\n</body>' % tables) 233 234 # temporarily uncomment next two lines to baseline this test 235 #with open('test_difflib_expect.html','w') as fp: 236 # fp.write(actual) 237 238 with open(findfile('test_difflib_expect.html')) as fp: 239 self.assertEqual(actual, fp.read()) 240 241 def test_recursion_limit(self): 242 # Check if the problem described in patch #1413711 exists. 243 limit = sys.getrecursionlimit() 244 old = [(i%2 and "K:%d" or "V:A:%d") % i for i in range(limit*2)] 245 new = [(i%2 and "K:%d" or "V:B:%d") % i for i in range(limit*2)] 246 difflib.SequenceMatcher(None, old, new).get_opcodes() 247 248 def test_make_file_default_charset(self): 249 html_diff = difflib.HtmlDiff() 250 output = html_diff.make_file(patch914575_from1.splitlines(), 251 patch914575_to1.splitlines()) 252 self.assertIn('content="text/html; charset=utf-8"', output) 253 254 def test_make_file_iso88591_charset(self): 255 html_diff = difflib.HtmlDiff() 256 output = html_diff.make_file(patch914575_from1.splitlines(), 257 patch914575_to1.splitlines(), 258 charset='iso-8859-1') 259 self.assertIn('content="text/html; charset=iso-8859-1"', output) 260 261 def test_make_file_usascii_charset_with_nonascii_input(self): 262 html_diff = difflib.HtmlDiff() 263 output = html_diff.make_file(patch914575_nonascii_from1.splitlines(), 264 patch914575_nonascii_to1.splitlines(), 265 charset='us-ascii') 266 self.assertIn('content="text/html; charset=us-ascii"', output) 267 self.assertIn('ımplıcıt', output) 268 269 270 class TestOutputFormat(unittest.TestCase): 271 def test_tab_delimiter(self): 272 args = ['one', 'two', 'Original', 'Current', 273 '2005-01-26 23:30:50', '2010-04-02 10:20:52'] 274 ud = difflib.unified_diff(*args, lineterm='') 275 self.assertEqual(list(ud)[0:2], [ 276 "--- Original\t2005-01-26 23:30:50", 277 "+++ Current\t2010-04-02 10:20:52"]) 278 cd = difflib.context_diff(*args, lineterm='') 279 self.assertEqual(list(cd)[0:2], [ 280 "*** Original\t2005-01-26 23:30:50", 281 "--- Current\t2010-04-02 10:20:52"]) 282 283 def test_no_trailing_tab_on_empty_filedate(self): 284 args = ['one', 'two', 'Original', 'Current'] 285 ud = difflib.unified_diff(*args, lineterm='') 286 self.assertEqual(list(ud)[0:2], ["--- Original", "+++ Current"]) 287 288 cd = difflib.context_diff(*args, lineterm='') 289 self.assertEqual(list(cd)[0:2], ["*** Original", "--- Current"]) 290 291 def test_range_format_unified(self): 292 # Per the diff spec at http://www.unix.org/single_unix_specification/ 293 spec = '''\ 294 Each <range> field shall be of the form: 295 %1d", <beginning line number> if the range contains exactly one line, 296 and: 297 "%1d,%1d", <beginning line number>, <number of lines> otherwise. 298 If a range is empty, its beginning line number shall be the number of 299 the line just before the range, or 0 if the empty range starts the file. 300 ''' 301 fmt = difflib._format_range_unified 302 self.assertEqual(fmt(3,3), '3,0') 303 self.assertEqual(fmt(3,4), '4') 304 self.assertEqual(fmt(3,5), '4,2') 305 self.assertEqual(fmt(3,6), '4,3') 306 self.assertEqual(fmt(0,0), '0,0') 307 308 def test_range_format_context(self): 309 # Per the diff spec at http://www.unix.org/single_unix_specification/ 310 spec = '''\ 311 The range of lines in file1 shall be written in the following format 312 if the range contains two or more lines: 313 "*** %d,%d ****\n", <beginning line number>, <ending line number> 314 and the following format otherwise: 315 "*** %d ****\n", <ending line number> 316 The ending line number of an empty range shall be the number of the preceding line, 317 or 0 if the range is at the start of the file. 318 319 Next, the range of lines in file2 shall be written in the following format 320 if the range contains two or more lines: 321 "--- %d,%d ----\n", <beginning line number>, <ending line number> 322 and the following format otherwise: 323 "--- %d ----\n", <ending line number> 324 ''' 325 fmt = difflib._format_range_context 326 self.assertEqual(fmt(3,3), '3') 327 self.assertEqual(fmt(3,4), '4') 328 self.assertEqual(fmt(3,5), '4,5') 329 self.assertEqual(fmt(3,6), '4,6') 330 self.assertEqual(fmt(0,0), '0') 331 332 333 class TestBytes(unittest.TestCase): 334 # don't really care about the content of the output, just the fact 335 # that it's bytes and we don't crash 336 def check(self, diff): 337 diff = list(diff) # trigger exceptions first 338 for line in diff: 339 self.assertIsInstance( 340 line, bytes, 341 "all lines of diff should be bytes, but got: %r" % line) 342 343 def test_byte_content(self): 344 # if we receive byte strings, we return byte strings 345 a = [b'hello', b'andr\xe9'] # iso-8859-1 bytes 346 b = [b'hello', b'andr\xc3\xa9'] # utf-8 bytes 347 348 unified = difflib.unified_diff 349 context = difflib.context_diff 350 351 check = self.check 352 check(difflib.diff_bytes(unified, a, a)) 353 check(difflib.diff_bytes(unified, a, b)) 354 355 # now with filenames (content and filenames are all bytes!) 356 check(difflib.diff_bytes(unified, a, a, b'a', b'a')) 357 check(difflib.diff_bytes(unified, a, b, b'a', b'b')) 358 359 # and with filenames and dates 360 check(difflib.diff_bytes(unified, a, a, b'a', b'a', b'2005', b'2013')) 361 check(difflib.diff_bytes(unified, a, b, b'a', b'b', b'2005', b'2013')) 362 363 # same all over again, with context diff 364 check(difflib.diff_bytes(context, a, a)) 365 check(difflib.diff_bytes(context, a, b)) 366 check(difflib.diff_bytes(context, a, a, b'a', b'a')) 367 check(difflib.diff_bytes(context, a, b, b'a', b'b')) 368 check(difflib.diff_bytes(context, a, a, b'a', b'a', b'2005', b'2013')) 369 check(difflib.diff_bytes(context, a, b, b'a', b'b', b'2005', b'2013')) 370 371 def test_byte_filenames(self): 372 # somebody renamed a file from ISO-8859-2 to UTF-8 373 fna = b'\xb3odz.txt' # "odz.txt" 374 fnb = b'\xc5\x82odz.txt' 375 376 # they transcoded the content at the same time 377 a = [b'\xa3odz is a city in Poland.'] 378 b = [b'\xc5\x81odz is a city in Poland.'] 379 380 check = self.check 381 unified = difflib.unified_diff 382 context = difflib.context_diff 383 check(difflib.diff_bytes(unified, a, b, fna, fnb)) 384 check(difflib.diff_bytes(context, a, b, fna, fnb)) 385 386 def assertDiff(expect, actual): 387 # do not compare expect and equal as lists, because unittest 388 # uses difflib to report difference between lists 389 actual = list(actual) 390 self.assertEqual(len(expect), len(actual)) 391 for e, a in zip(expect, actual): 392 self.assertEqual(e, a) 393 394 expect = [ 395 b'--- \xb3odz.txt', 396 b'+++ \xc5\x82odz.txt', 397 b'@@ -1 +1 @@', 398 b'-\xa3odz is a city in Poland.', 399 b'+\xc5\x81odz is a city in Poland.', 400 ] 401 actual = difflib.diff_bytes(unified, a, b, fna, fnb, lineterm=b'') 402 assertDiff(expect, actual) 403 404 # with dates (plain ASCII) 405 datea = b'2005-03-18' 406 dateb = b'2005-03-19' 407 check(difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb)) 408 check(difflib.diff_bytes(context, a, b, fna, fnb, datea, dateb)) 409 410 expect = [ 411 # note the mixed encodings here: this is deeply wrong by every 412 # tenet of Unicode, but it doesn't crash, it's parseable by 413 # patch, and it's how UNIX(tm) diff behaves 414 b'--- \xb3odz.txt\t2005-03-18', 415 b'+++ \xc5\x82odz.txt\t2005-03-19', 416 b'@@ -1 +1 @@', 417 b'-\xa3odz is a city in Poland.', 418 b'+\xc5\x81odz is a city in Poland.', 419 ] 420 actual = difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb, 421 lineterm=b'') 422 assertDiff(expect, actual) 423 424 def test_mixed_types_content(self): 425 # type of input content must be consistent: all str or all bytes 426 a = [b'hello'] 427 b = ['hello'] 428 429 unified = difflib.unified_diff 430 context = difflib.context_diff 431 432 expect = "lines to compare must be str, not bytes (b'hello')" 433 self._assert_type_error(expect, unified, a, b) 434 self._assert_type_error(expect, unified, b, a) 435 self._assert_type_error(expect, context, a, b) 436 self._assert_type_error(expect, context, b, a) 437 438 expect = "all arguments must be bytes, not str ('hello')" 439 self._assert_type_error(expect, difflib.diff_bytes, unified, a, b) 440 self._assert_type_error(expect, difflib.diff_bytes, unified, b, a) 441 self._assert_type_error(expect, difflib.diff_bytes, context, a, b) 442 self._assert_type_error(expect, difflib.diff_bytes, context, b, a) 443 444 def test_mixed_types_filenames(self): 445 # cannot pass filenames as bytes if content is str (this may not be 446 # the right behaviour, but at least the test demonstrates how 447 # things work) 448 a = ['hello\n'] 449 b = ['ohell\n'] 450 fna = b'ol\xe9.txt' # filename transcoded from ISO-8859-1 451 fnb = b'ol\xc3a9.txt' # to UTF-8 452 self._assert_type_error( 453 "all arguments must be str, not: b'ol\\xe9.txt'", 454 difflib.unified_diff, a, b, fna, fnb) 455 456 def test_mixed_types_dates(self): 457 # type of dates must be consistent with type of contents 458 a = [b'foo\n'] 459 b = [b'bar\n'] 460 datea = '1 fv' 461 dateb = '3 fv' 462 self._assert_type_error( 463 "all arguments must be bytes, not str ('1 fv')", 464 difflib.diff_bytes, difflib.unified_diff, 465 a, b, b'a', b'b', datea, dateb) 466 467 # if input is str, non-ASCII dates are fine 468 a = ['foo\n'] 469 b = ['bar\n'] 470 list(difflib.unified_diff(a, b, 'a', 'b', datea, dateb)) 471 472 def _assert_type_error(self, msg, generator, *args): 473 with self.assertRaises(TypeError) as ctx: 474 list(generator(*args)) 475 self.assertEqual(msg, str(ctx.exception)) 476 477 class TestJunkAPIs(unittest.TestCase): 478 def test_is_line_junk_true(self): 479 for line in ['#', ' ', ' #', '# ', ' # ', '']: 480 self.assertTrue(difflib.IS_LINE_JUNK(line), repr(line)) 481 482 def test_is_line_junk_false(self): 483 for line in ['##', ' ##', '## ', 'abc ', 'abc #', 'Mr. Moose is up!']: 484 self.assertFalse(difflib.IS_LINE_JUNK(line), repr(line)) 485 486 def test_is_line_junk_REDOS(self): 487 evil_input = ('\t' * 1000000) + '##' 488 self.assertFalse(difflib.IS_LINE_JUNK(evil_input)) 489 490 def test_is_character_junk_true(self): 491 for char in [' ', '\t']: 492 self.assertTrue(difflib.IS_CHARACTER_JUNK(char), repr(char)) 493 494 def test_is_character_junk_false(self): 495 for char in ['a', '#', '\n', '\f', '\r', '\v']: 496 self.assertFalse(difflib.IS_CHARACTER_JUNK(char), repr(char)) 497 498 def test_main(): 499 difflib.HtmlDiff._default_prefix = 0 500 Doctests = doctest.DocTestSuite(difflib) 501 run_unittest( 502 TestWithAscii, TestAutojunk, TestSFpatches, TestSFbugs, 503 TestOutputFormat, TestBytes, TestJunkAPIs, Doctests) 504 505 if __name__ == '__main__': 506 test_main() 507