#!/usr/bin/env python
"""Helpers for diffing, colorizing and summarizing line-oriented test output.

The module provides:

* ``ColorFormatter`` / ``DiffColorizer`` -- highlight the differing words of
  paired diff lines as plain text, ANSI escapes or HTML.
* ``ZipDiffer`` -- a line-by-line ("zipped") diff of several inputs.
* ``DiffFilters`` / ``DiffSinks`` / ``Stat`` / ``Stats`` / ``Test`` --
  group diff lines into test cases and report pass/fail statistics.
* ``Unicode`` -- convert between text and ``U+XXXX`` notation.
* ``UtilMains`` / ``FilterHelpers`` / ``FileHelpers`` -- small command-line
  drivers built around ``sys.argv``.
* ``Manifest`` -- read and regenerate ``MANIFEST`` listings.

The code runs on both Python 2.6+ and Python 3.
"""

from __future__ import print_function

import difflib
import errno
import os
import re
import sys
import unicodedata
# Star import kept from the original file: ``groupby`` and
# ``izip_longest``/``zip_longest`` are used below.  Other modules that
# star-import this one may rely on the remaining names -- TODO confirm before
# narrowing it.
from itertools import *

try:
    # Python 3.2+.  ``cgi.escape`` was removed in 3.8 and ``cgi`` in 3.13.
    from html import escape as _escape_html
except ImportError:  # Python 2
    import cgi
    _escape_html = cgi.escape

# One marker character per input, by position: the first input is '-', the
# second '+', and so on.
diff_symbols = "-+=*&^%$#@!~/"
# Highlight color per input, by position.
diff_colors = ['red', 'green', 'blue']

if sys.version_info[0] >= 3:
    # Python 2 spellings used throughout this module.
    unichr = chr
    izip_longest = zip_longest


def _exit_on_io_error(e):
    """Report the I/O error *e* on stderr and exit with status 1.

    A broken pipe (``EPIPE``) is not reported; the process still exits.
    """
    # NOTE(review): the exit is taken to be unconditional (outside the EPIPE
    # check); the indentation of the original handler was lost -- confirm.
    if e.errno != errno.EPIPE:
        print("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
    sys.exit(1)


def _to_native_str(s):
    """Return text *s* as the native ``str`` type.

    On Python 2 that is a UTF-8 encoded byte string; on Python 3 the text is
    returned unchanged.
    """
    if sys.version_info[0] == 2:
        return s.encode('utf-8')
    return s


class ColorFormatter:
    """Namespace of output formatters.

    Each nested class exposes the same four static methods:
    ``start_color(c)``, ``end_color()``, ``escape(s)`` and ``newline()``.
    """

    class Null:
        """Formatter that emits no color markup at all."""

        @staticmethod
        def start_color(c):
            return ''

        @staticmethod
        def end_color():
            return ''

        @staticmethod
        def escape(s):
            return s

        @staticmethod
        def newline():
            return '\n'

    class ANSI:
        """Formatter that emits ANSI terminal escape sequences."""

        @staticmethod
        def start_color(c):
            """Return the escape sequence for color name *c*.

            Raises KeyError for names other than 'red', 'green' and 'blue'.
            """
            return {
                'red': '\033[41;37;1m',
                'green': '\033[42;37;1m',
                'blue': '\033[44;37;1m',
            }[c]

        @staticmethod
        def end_color():
            return '\033[m'

        @staticmethod
        def escape(s):
            return s

        @staticmethod
        def newline():
            return '\n'

    class HTML:
        """Formatter that emits HTML ``<span>`` elements."""

        @staticmethod
        def start_color(c):
            return '<span style="background:%s">' % c

        @staticmethod
        def end_color():
            return '</span>'

        @staticmethod
        def escape(s):
            # quote=False escapes only &, < and >, as cgi.escape did by default.
            return _escape_html(s, quote=False)

        @staticmethod
        def newline():
            return '<br/>\n'

    @staticmethod
    def Auto(argv=None, out=sys.stdout):
        """Pick a formatter class from command-line flags.

        Recognized flags are ``--format``, ``--format=ansi``,
        ``--format=html`` and ``--no-format``.  The first occurrence of each
        recognized flag is removed from *argv* in place.  When several flags
        are present, the one checked last (in the order listed) wins.

        argv -- mutable list of arguments; defaults to an empty list.
        out  -- accepted for interface compatibility; not used.

        Returns ColorFormatter.ANSI, ColorFormatter.HTML or
        ColorFormatter.Null.  ANSI is the default.
        """
        if argv is None:
            argv = []
        chosen = ColorFormatter.ANSI
        flag_table = (
            ("--format", ColorFormatter.ANSI),
            ("--format=ansi", ColorFormatter.ANSI),
            ("--format=html", ColorFormatter.HTML),
            ("--no-format", ColorFormatter.Null),
        )
        for flag, formatter in flag_table:
            if flag in argv:
                argv.remove(flag)
                chosen = formatter
        return chosen


class DiffColorizer:
    """Highlight the words that differ between paired diff lines."""

    # Splits a line into runs of word characters, each followed by at most
    # one separator character.  The original class read ``a-za-z``, which
    # left uppercase letters out of the word class; ``a-zA-Z`` is used here.
    diff_regex = re.compile('([a-zA-Z0-9_]*)([^a-zA-Z0-9_]?)')

    def __init__(self, formatter, colors=diff_colors, symbols=diff_symbols):
        """Store the formatter, per-side colors and per-side marker symbols."""
        self.formatter = formatter
        self.colors = colors
        self.symbols = symbols

    def colorize_lines(self, lines):
        """Return the two lines with their differing tokens highlighted.

        lines -- exactly two entries (old, new); a falsy entry is treated as
                 an empty line.

        Returns a list of output lines.  Each is prefixed with the side's
        symbol and terminated with the formatter's newline.  A side whose
        output is empty is omitted.
        """
        fmt = self.formatter
        # One token per "line", so difflib can compare token sequences.
        tokens = [self.diff_regex.sub(r'\1\n\2\n', l or '').splitlines(True)
                  for l in lines]
        out = ["", ""]
        colored = [False, False]

        def close_colors():
            for side in range(2):
                if colored[side]:
                    out[side] += fmt.end_color()
                    colored[side] = False

        for l in difflib.Differ().compare(*tokens):
            marker = l[0]
            if marker == '?':
                # difflib hint line; carries no content.
                continue
            text = fmt.escape(l[2:])
            if marker == ' ':
                # Common token: end any open highlight and copy to both sides.
                close_colors()
                for side in range(2):
                    out[side] += text
                continue
            # difflib always marks the first sequence '-' and the second '+',
            # whatever self.symbols holds.  The previous lookup in
            # self.symbols dropped changed tokens for custom symbol sets.
            side = '-+'.find(marker)
            if side < 0:
                continue
            if not colored[side]:
                out[side] += fmt.start_color(self.colors[side])
                colored[side] = True
            out[side] += text
        close_colors()

        # Drop the token separators and the line's own newline.
        out = [o.replace('\n', '') for o in out]
        return [symbol + body + fmt.newline()
                for symbol, body in zip(self.symbols, out) if body]

    def colorize_diff(self, f):
        """Yield the lines of diff *f* with changed line pairs colorized.

        Lines whose first character is not in ``self.symbols`` are escaped
        and yielded straight away.  Lines carrying one of the first two
        symbols are buffered per side.  The buffer is flushed through
        colorize_lines() when both sides are filled, when a side would be
        overwritten, and at end of input.
        """
        fmt = self.formatter
        pending = [None, None]
        for l in f:
            if l[0] not in self.symbols:
                yield fmt.escape(l).replace('\n', fmt.newline())
                continue
            side = self.symbols.index(l[0])
            if pending[side]:
                # This side is already occupied: flush before overwriting.
                for line in self.colorize_lines(pending):
                    yield line
                pending = [None, None]
            pending[side] = l[1:]
            if all(pending):
                for line in self.colorize_lines(pending):
                    yield line
                pending = [None, None]
        if any(pending):
            for line in self.colorize_lines(pending):
                yield line


class ZipDiffer:
    """Line-by-line ("zipped") diff of several inputs."""

    @staticmethod
    def diff_files(files, symbols=diff_symbols):
        """Write a zipped diff of *files* to standard output.

        files   -- iterable of line iterables; they are read in lockstep.
        symbols -- marker characters; ``symbols[i]`` prefixes lines taken
                   from the i-th input.

        A row in which every input has the same line is written once,
        prefixed with a space.  Otherwise each non-empty line is written
        with its input's symbol.  An IOError is reported on stderr (unless
        it is EPIPE) and terminates the process with status 1.
        """
        files = tuple(files)  # in case it's a generator, copy it
        try:
            for lines in izip_longest(*files):
                if all(lines[0] == line for line in lines[1:]):
                    sys.stdout.writelines([" ", lines[0]])
                    continue

                for i, l in enumerate(lines):
                    if l:
                        sys.stdout.writelines([symbols[i], l])
        except IOError as e:
            _exit_on_io_error(e)


class DiffFilters:
    """Generators that filter a stream of diff lines."""

    @staticmethod
    def filter_failures(f):
        """Yield only the lines that belong to failed test cases."""
        for key, lines in DiffHelpers.separate_test_cases(f):
            lines = list(lines)
            if not DiffHelpers.test_passed(lines):
                for l in lines:
                    yield l


class Stat:
    """Running count and summed ``freq`` of the tests added so far."""

    def __init__(self):
        self.count = 0
        self.freq = 0

    def add(self, test):
        """Account for one more test; reads ``test.freq``."""
        self.count += 1
        self.freq += test.freq


class Stats:
    """Pass/fail/total counters for a set of tests."""

    def __init__(self):
        self.passed = Stat()
        self.failed = Stat()
        self.total = Stat()

    def add(self, test):
        """Record *test* in the totals and in passed or failed (``test.passed``)."""
        self.total.add(test)
        if test.passed:
            self.passed.add(test)
        else:
            self.failed.add(test)

    def mean(self):
        """Return the fraction of tests that passed.

        Raises ZeroDivisionError if no test has been added.
        """
        return float(self.passed.count) / self.total.count

    def variance(self):
        """Return pass-fraction times fail-fraction.

        Raises ZeroDivisionError if no test has been added.
        """
        return (float(self.passed.count) / self.total.count) * \
               (float(self.failed.count) / self.total.count)

    def stddev(self):
        """Return the square root of variance()."""
        return self.variance() ** .5

    def zscore(self, population):
        """Calculate the standard score.
        Population is the Stats for population.
        Self is Stats for sample.
        Returns larger absolute value if sample is highly unlikely to be random.
        Anything outside of -3..+3 is very unlikely to be random.
        See: http://en.wikipedia.org/wiki/Standard_score

        When the population has zero standard deviation the division is
        undefined.  Then 0.0 is returned if the two means agree, otherwise
        an infinity carrying the sign of the difference."""
        delta = self.mean() - population.mean()
        spread = population.stddev()
        if spread == 0:
            if delta == 0:
                return 0.0
            return float('inf') if delta > 0 else float('-inf')
        return delta / spread


class DiffSinks:
    """Consumers that read a whole diff and print a summary."""

    @staticmethod
    def print_stat(f):
        """Print how many test cases in *f* passed and how many failed."""
        passed = 0
        failed = 0
        # XXX port to Stats, but that would really slow us down here
        for key, lines in DiffHelpers.separate_test_cases(f):
            if DiffHelpers.test_passed(lines):
                passed += 1
            else:
                failed += 1
        total = passed + failed
        # Empty input used to raise ZeroDivisionError; report 0% instead.
        percent = 100. * failed / total if total else 0.
        print("%d out of %d tests passed. %d failed (%g%%)" % (passed, total, failed, percent))

    @staticmethod
    def print_ngrams(f, ns=(1, 2, 3)):
        """Print a z-score line for every frequently failing codepoint n-gram.

        f  -- iterable of diff lines.
        ns -- n-gram lengths to collect.

        Only n-grams that occur in at least 30 failed tests are printed.
        Tests without a ``<...>`` codepoint line contribute no n-grams.
        """
        gens = tuple(Ngram.generator(n) for n in ns)
        allstats = Stats()
        allgrams = {}
        for key, lines in DiffHelpers.separate_test_cases(f):
            test = Test(lines)
            allstats.add(test)

            # ``unicodes`` stays None when the test has no <...> line.
            codepoints = test.unicodes or ()
            for gen in gens:
                for ngram in gen(codepoints):
                    if ngram not in allgrams:
                        allgrams[ngram] = Stats()
                    allgrams[ngram].add(test)

        for ngram, stats in allgrams.items():
            if stats.failed.count < 30:  # for statistical reasons
                continue
            print("zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (
                stats.zscore(allstats),
                stats.failed.count,
                stats.passed.count,
                ','.join("U+%04X" % u for u in ngram)))


class Test:
    """One test case parsed from its diff lines.

    Attributes set by the constructor: ``freq`` (always 1), ``passed``,
    ``identifier``, ``text`` (from a ``(...)`` line), ``unicodes`` (from a
    ``<...>`` line, as a list of ints) and ``glyphs`` (from a ``[...]``
    line).  Attributes with no matching line stay None.
    """

    # The class name starts with "Test"; tell pytest/nose not to collect it.
    __test__ = False

    def __init__(self, lines):
        self.freq = 1
        self.passed = True
        self.identifier = None
        self.text = None
        self.unicodes = None
        self.glyphs = None
        for l in lines:
            # Any line not marked with a space is a difference.
            if l[:1] != ' ':
                self.passed = False

            body = l[:-1] if l.endswith('\n') else l
            start = 1
            colon = body.find(':')
            if colon >= 0:
                if not self.identifier:
                    self.identifier = body[1:colon]
                start = colon + 2  # Skip colon and space
            if len(body) <= start:
                # Nothing follows the marker/identifier.
                continue

            brackets = body[start] + body[-1]
            # Slice up to the closing bracket.  The original used a fixed -2,
            # which ate a payload character when the line had no newline.
            payload = body[start + 1:-1]
            if brackets == '()':
                self.text = payload
            elif brackets == '<>':
                self.unicodes = Unicode.parse(payload)
            elif brackets == '[]':
                # XXX we don't handle failed tests here
                self.glyphs = payload


class DiffHelpers:
    """Helpers for grouping and judging diff lines."""

    # Substrings of a '+' line that make the whole test case count as passed.
    _pass_markers = (
        "space+0|space+0",
        "uni25CC",
        "dottedcircle",
        "glyph0",
        "gid0",
        "notdef",
    )

    @staticmethod
    def separate_test_cases(f):
        '''Reads lines from f, and if the lines have identifiers, ie.
        have a colon character, groups them by identifier,
        yielding lists of all lines with the same identifier.

        Only consecutive lines are grouped (itertools.groupby).  A line
        with no colon after its first character is its own key.'''

        def identifier(l):
            # Search from index 1 so a leading ':' marker is never taken for
            # the separator.
            colon = l.find(':', 1)
            if colon > 0:
                return l[1:colon]
            return l
        return groupby(f, key=identifier)

    @staticmethod
    def test_passed(lines):
        """Return True if the test case made of *lines* counts as passed.

        A case passes when every line starts with a space, or when any '+'
        line contains one of the ``_pass_markers`` substrings.
        """
        lines = list(lines)
        # XXX This is a hack, but does the job for now.
        for l in lines:
            if l[:1] == '+' and any(m in l for m in DiffHelpers._pass_markers):
                return True
        return all(l[:1] == ' ' for l in lines)


class FilterHelpers:
    """Adapters that turn a line filter into a function that prints."""

    @staticmethod
    def filter_printer_function(filter_callback):
        """Return a function that print()s every item of ``filter_callback(f)``."""
        def printer(f):
            for line in filter_callback(f):
                print(line)
        return printer

    @staticmethod
    def filter_printer_function_no_newline(filter_callback):
        """Return a function that writes every item of ``filter_callback(f)``
        to stdout without adding a newline."""
        def printer(f):
            for line in filter_callback(f):
                sys.stdout.writelines([line])
        return printer


class Ngram:
    """Sliding-window n-gram generators."""

    @staticmethod
    def generator(n):
        """Return a generator function yielding every run of *n* consecutive
        items of its argument as a tuple.  The function's ``n`` attribute
        holds the window size."""

        def gen(f):
            window = []
            for x in f:
                window.append(x)
                if len(window) == n:
                    yield tuple(window)
                    del window[0]

        gen.n = n
        return gen


class UtilMains:
    """Command-line drivers built around ``sys.argv``."""

    @staticmethod
    def process_multiple_files(callback, mnemonic="FILE"):
        """Call *callback* with an open file for each command-line argument.

        With no arguments, standard input ('-') is processed.  ``--help``
        prints a usage line and exits with status 1.  Each opened file is
        closed after its callback returns.  An IOError is reported on stderr
        (unless it is EPIPE) and terminates the process with status 1.
        """
        if "--help" in sys.argv:
            print("Usage: %s %s..." % (sys.argv[0], mnemonic))
            sys.exit(1)

        try:
            files = sys.argv[1:] if len(sys.argv) > 1 else ['-']
            for s in files:
                f = FileHelpers.open_file_or_stdin(s)
                try:
                    callback(f)
                finally:
                    if f is not sys.stdin:
                        f.close()
        except IOError as e:
            _exit_on_io_error(e)

    @staticmethod
    def process_multiple_args(callback, mnemonic):
        """Call *callback* with each command-line argument in turn.

        Prints a usage line and exits with status 1 when there are no
        arguments or ``--help`` is given.  An IOError is reported on stderr
        (unless it is EPIPE) and terminates the process with status 1.
        """
        if len(sys.argv) == 1 or "--help" in sys.argv:
            print("Usage: %s %s..." % (sys.argv[0], mnemonic))
            sys.exit(1)

        try:
            for s in sys.argv[1:]:
                callback(s)
        except IOError as e:
            _exit_on_io_error(e)

    @staticmethod
    def filter_multiple_strings_or_stdin(callback, mnemonic,
                                         separator=" ",
                                         concat_separator=False):
        """Print ``callback(x)`` for each argument or each stdin line.

        callback         -- function mapping one string to one string.
        mnemonic         -- argument name shown in the usage text.
        separator        -- joins the results when arguments are given.
        concat_separator -- if not False, the arguments are first joined
                            with it into a single string.

        With no arguments, stdin is read line by line and one result is
        printed per line.  ``--help`` prints usage and exits with status 1.
        An IOError is reported on stderr (unless it is EPIPE) and terminates
        the process with status 1.
        """
        if "--help" in sys.argv:
            print("Usage:\n %s %s...\nor:\n %s\n\nWhen called with no arguments, input is read from standard input."
                  % (sys.argv[0], mnemonic, sys.argv[0]))
            sys.exit(1)

        try:
            if len(sys.argv) == 1:
                # readline() rather than iterating the file, as the original.
                while True:
                    line = sys.stdin.readline()
                    if not line:
                        break
                    if line[-1] == '\n':
                        line = line[:-1]
                    print(callback(line))
            else:
                args = sys.argv[1:]
                # Equality test kept from the original: "" is a valid
                # separator and must not be treated as "disabled".
                if concat_separator != False:
                    args = [concat_separator.join(args)]
                print(separator.join(callback(x) for x in args))
        except IOError as e:
            _exit_on_io_error(e)


class Unicode:
    """Conversions between text and ``U+XXXX`` codepoint notation."""

    @staticmethod
    def decode(s):
        """Return the codepoints of *s* as a ``U+XXXX,U+XXXX,...`` string.

        s -- text, or UTF-8 encoded bytes.
        """
        if isinstance(s, bytes):
            s = s.decode('utf-8')
        return _to_native_str(u','.join("U+%04X" % ord(u) for u in s))

    @staticmethod
    def parse(s):
        """Return the list of integer codepoints written in *s*.

        ``0x`` prefixes and the characters ``<+>{},;&#\\xXuUnNiI``, newline
        and space all act as separators.  What remains is read as
        hexadecimal numbers; anything else raises ValueError.
        """
        s = re.sub(r"0[xX]", " ", s)
        s = re.sub(r"[<+>{},;&#\\xXuUnNiI\n ]", " ", s)
        return [int(x, 16) for x in s.split()]

    @staticmethod
    def encode(s):
        """Return the text for the codepoints written in *s* (see parse())."""
        return _to_native_str(u''.join(unichr(x) for x in Unicode.parse(s)))

    shorthands = {
        "ZERO WIDTH NON-JOINER": "ZWNJ",
        "ZERO WIDTH JOINER": "ZWJ",
        "NARROW NO-BREAK SPACE": "NNBSP",
        "COMBINING GRAPHEME JOINER": "CGJ",
        "LEFT-TO-RIGHT MARK": "LRM",
        "RIGHT-TO-LEFT MARK": "RLM",
        "LEFT-TO-RIGHT EMBEDDING": "LRE",
        "RIGHT-TO-LEFT EMBEDDING": "RLE",
        "POP DIRECTIONAL FORMATTING": "PDF",
        "LEFT-TO-RIGHT OVERRIDE": "LRO",
        "RIGHT-TO-LEFT OVERRIDE": "RLO",
    }

    @staticmethod
    def pretty_name(u):
        """Return a shortened Unicode character name for the character *u*.

        Returns "XXX" when the character has no name.
        """
        try:
            s = unicodedata.name(u)
        except ValueError:
            return "XXX"
        s = re.sub(".* LETTER ", "", s)
        s = re.sub(".* VOWEL SIGN (.*)", r"\1-MATRA", s)
        s = re.sub(".* SIGN ", "", s)
        s = re.sub(".* COMBINING ", "", s)
        # NOTE(review): for names of the form "... SIGN VIRAMA" the rule
        # above has already reduced s to "VIRAMA", which this pattern (it
        # needs a preceding space) does not match.  Behavior kept as-is;
        # confirm whether HALANT was intended there.
        if re.match(".* VIRAMA", s):
            s = "HALANT"
        if s in Unicode.shorthands:
            s = Unicode.shorthands[s]
        return s

    @staticmethod
    def pretty_names(s):
        """Return the pretty names of the codepoints in *s*, joined by ' + '.

        s -- hexadecimal codepoints separated by commas, spaces or newlines,
             optionally written with ``U+``, ``\\u``, ``0x`` or ``<>``.
        """
        s = re.sub(r"[<+>\\uU]", " ", s)
        s = re.sub(r"0[xX]", " ", s)
        chars = [unichr(int(x, 16)) for x in re.split('[, \n]', s) if len(x)]
        return _to_native_str(u' + '.join(Unicode.pretty_name(x) for x in chars))


class FileHelpers:
    """File-opening helpers."""

    @staticmethod
    def open_file_or_stdin(f):
        """Return ``sys.stdin`` when *f* is '-', else *f* opened for reading."""
        if f == '-':
            return sys.stdin
        # open() rather than the Python 2-only file() builtin.
        return open(f)


class Manifest:
    """Reading and regenerating ``MANIFEST`` listings."""

    @staticmethod
    def read(s, strict=True):
        """Yield the files reachable from the path *s*.

        A file yields its own normalized path.  A directory is expanded
        through the entries of its ``MANIFEST`` file, one per line,
        recursively.  Blank MANIFEST lines are ignored.

        strict -- when true, a missing path or unreadable MANIFEST prints a
                  message to stderr and exits with status 1.  When false,
                  the entry is skipped silently.  The flag also applies to
                  nested entries.
        """
        if not os.path.exists(s):
            if strict:
                print("%s: %s does not exist" % (sys.argv[0], s), file=sys.stderr)
                sys.exit(1)
            return

        s = os.path.normpath(s)

        if not os.path.isdir(s):
            yield s
            return

        manifest = os.path.join(s, "MANIFEST")
        try:
            with open(manifest) as m:
                items = [x.strip() for x in m]
        except IOError:
            if strict:
                print("%s: %s does not exist" % (sys.argv[0], manifest), file=sys.stderr)
                sys.exit(1)
            return

        for item in items:
            if not item:
                # A blank entry would join back to the directory itself and
                # recurse forever.
                continue
            for p in Manifest.read(os.path.join(s, item), strict):
                yield p

    @staticmethod
    def update_recursive(s):
        """Regenerate the ``MANIFEST`` file of *s* and every directory below it.

        Each MANIFEST lists the directory's files and then its
        subdirectories, each group sorted.  Entries named MANIFEST, README,
        LICENSE, COPYING, AUTHORS, SOURCES or ChangeLog are left out, and
        directories with those names are not descended into.  A progress
        line is printed per file written.
        """
        skipped = ["MANIFEST", "README", "LICENSE", "COPYING", "AUTHORS", "SOURCES", "ChangeLog"]

        # os.walk() already descends into every subdirectory.  The explicit
        # recursive call the original made on top of it regenerated nested
        # MANIFESTs over and over.
        for dirpath, dirnames, filenames in os.walk(s, followlinks=True):

            for f in skipped:
                # Editing dirnames in place also prunes the walk.
                if f in dirnames:
                    dirnames.remove(f)
                if f in filenames:
                    filenames.remove(f)
            dirnames.sort()
            filenames.sort()
            ms = os.path.join(dirpath, "MANIFEST")
            print(" GEN %s" % ms)
            with open(ms, "w") as m:
                for f in filenames:
                    print(f, file=m)
                for f in dirnames:
                    print(f, file=m)


if __name__ == '__main__':
    pass