Home | History | Annotate | Download | only in shaping
      1 #!/usr/bin/python
      2 
      3 import sys, os, re, difflib, unicodedata, errno, cgi
      4 from itertools import *
      5 
# One marker character per compared input, by position: the first input is
# marked '-', the second '+', and so on.
diff_symbols = "-+=*&^%$#@!~/"
# Highlight color names, indexed the same way as diff_symbols.
diff_colors = ['red', 'green', 'blue']
      8 
      9 class ColorFormatter:
     10 
     11 	class Null:
     12 		@staticmethod
     13 		def start_color (c): return ''
     14 		@staticmethod
     15 		def end_color (): return ''
     16 		@staticmethod
     17 		def escape (s): return s
     18 		@staticmethod
     19 		def newline (): return '\n'
     20 
     21 	class ANSI:
     22 		@staticmethod
     23 		def start_color (c):
     24 			return {
     25 				'red': '\033[41;37;1m',
     26 				'green': '\033[42;37;1m',
     27 				'blue': '\033[44;37;1m',
     28 			}[c]
     29 		@staticmethod
     30 		def end_color ():
     31 			return '\033[m'
     32 		@staticmethod
     33 		def escape (s): return s
     34 		@staticmethod
     35 		def newline (): return '\n'
     36 
     37 	class HTML:
     38 		@staticmethod
     39 		def start_color (c):
     40 			return '<span style="background:%s">' % c
     41 		@staticmethod
     42 		def end_color ():
     43 			return '</span>'
     44 		@staticmethod
     45 		def escape (s): return cgi.escape (s)
     46 		@staticmethod
     47 		def newline (): return '<br/>\n'
     48 
     49 	@staticmethod
     50 	def Auto (argv = [], out = sys.stdout):
     51 		format = ColorFormatter.ANSI
     52 		if "--format" in argv:
     53 			argv.remove ("--format")
     54 			format = ColorFormatter.ANSI
     55 		if "--format=ansi" in argv:
     56 			argv.remove ("--format=ansi")
     57 			format = ColorFormatter.ANSI
     58 		if "--format=html" in argv:
     59 			argv.remove ("--format=html")
     60 			format = ColorFormatter.HTML
     61 		if "--no-format" in argv:
     62 			argv.remove ("--no-format")
     63 			format = ColorFormatter.Null
     64 		return format
     65 
     66 
     67 class DiffColorizer:
     68 
     69 	diff_regex = re.compile ('([a-za-z0-9_]*)([^a-za-z0-9_]?)')
     70 
     71 	def __init__ (self, formatter, colors=diff_colors, symbols=diff_symbols):
     72 		self.formatter = formatter
     73 		self.colors = colors
     74 		self.symbols = symbols
     75 
     76 	def colorize_lines (self, lines):
     77 		lines = (l if l else '' for l in lines)
     78 		ss = [self.diff_regex.sub (r'\1\n\2\n', l).splitlines (True) for l in lines]
     79 		oo = ["",""]
     80 		st = [False, False]
     81 		for l in difflib.Differ().compare (*ss):
     82 			if l[0] == '?':
     83 				continue
     84 			if l[0] == ' ':
     85 				for i in range(2):
     86 					if st[i]:
     87 						oo[i] += self.formatter.end_color ()
     88 						st[i] = False
     89 				oo = [o + self.formatter.escape (l[2:]) for o in oo]
     90 				continue
     91 			if l[0] in self.symbols:
     92 				i = self.symbols.index (l[0])
     93 				if not st[i]:
     94 					oo[i] += self.formatter.start_color (self.colors[i])
     95 					st[i] = True
     96 				oo[i] += self.formatter.escape (l[2:])
     97 				continue
     98 		for i in range(2):
     99 			if st[i]:
    100 				oo[i] += self.formatter.end_color ()
    101 				st[i] = False
    102 		oo = [o.replace ('\n', '') for o in oo]
    103 		return [s1+s2+self.formatter.newline () for (s1,s2) in zip (self.symbols, oo) if s2]
    104 
    105 	def colorize_diff (self, f):
    106 		lines = [None, None]
    107 		for l in f:
    108 			if l[0] not in self.symbols:
    109 				yield self.formatter.escape (l).replace ('\n', self.formatter.newline ())
    110 				continue
    111 			i = self.symbols.index (l[0])
    112 			if lines[i]:
    113 				# Flush
    114 				for line in self.colorize_lines (lines):
    115 					yield line
    116 				lines = [None, None]
    117 			lines[i] = l[1:]
    118 			if (all (lines)):
    119 				# Flush
    120 				for line in self.colorize_lines (lines):
    121 					yield line
    122 				lines = [None, None]
    123 		if (any (lines)):
    124 			# Flush
    125 			for line in self.colorize_lines (lines):
    126 				yield line
    127 
    128 
    129 class ZipDiffer:
    130 
    131 	@staticmethod
    132 	def diff_files (files, symbols=diff_symbols):
    133 		files = tuple (files) # in case it's a generator, copy it
    134 		try:
    135 			for lines in izip_longest (*files):
    136 				if all (lines[0] == line for line in lines[1:]):
    137 					sys.stdout.writelines ([" ", lines[0]])
    138 					continue
    139 
    140 				for i, l in enumerate (lines):
    141 					if l:
    142 						sys.stdout.writelines ([symbols[i], l])
    143 		except IOError as e:
    144 			if e.errno != errno.EPIPE:
    145 				print >> sys.stderr, "%s: %s: %s" % (sys.argv[0], e.filename, e.strerror)
    146 				sys.exit (1)
    147 
    148 
    149 class DiffFilters:
    150 
    151 	@staticmethod
    152 	def filter_failures (f):
    153 		for key, lines in DiffHelpers.separate_test_cases (f):
    154 			lines = list (lines)
    155 			if not DiffHelpers.test_passed (lines):
    156 				for l in lines: yield l
    157 
    158 class Stat:
    159 
    160 	def __init__ (self):
    161 		self.count = 0
    162 		self.freq = 0
    163 
    164 	def add (self, test):
    165 		self.count += 1
    166 		self.freq += test.freq
    167 
    168 class Stats:
    169 
    170 	def __init__ (self):
    171 		self.passed = Stat ()
    172 		self.failed = Stat ()
    173 		self.total  = Stat ()
    174 
    175 	def add (self, test):
    176 		self.total.add (test)
    177 		if test.passed:
    178 			self.passed.add (test)
    179 		else:
    180 			self.failed.add (test)
    181 
    182 	def mean (self):
    183 		return float (self.passed.count) / self.total.count
    184 
    185 	def variance (self):
    186 		return (float (self.passed.count) / self.total.count) * \
    187 		       (float (self.failed.count) / self.total.count)
    188 
    189 	def stddev (self):
    190 		return self.variance () ** .5
    191 
    192 	def zscore (self, population):
    193 		"""Calculate the standard score.
    194 		   Population is the Stats for population.
    195 		   Self is Stats for sample.
    196 		   Returns larger absolute value if sample is highly unlikely to be random.
    197 		   Anything outside of -3..+3 is very unlikely to be random.
    198 		   See: http://en.wikipedia.org/wiki/Standard_score"""
    199 
    200 		return (self.mean () - population.mean ()) / population.stddev ()
    201 
    202 
    203 
    204 
    205 class DiffSinks:
    206 
    207 	@staticmethod
    208 	def print_stat (f):
    209 		passed = 0
    210 		failed = 0
    211 		# XXX port to Stats, but that would really slow us down here
    212 		for key, lines in DiffHelpers.separate_test_cases (f):
    213 			if DiffHelpers.test_passed (lines):
    214 				passed += 1
    215 			else:
    216 				failed += 1
    217 		total = passed + failed
    218 		print "%d out of %d tests passed.  %d failed (%g%%)" % (passed, total, failed, 100. * failed / total)
    219 
    220 	@staticmethod
    221 	def print_ngrams (f, ns=(1,2,3)):
    222 		gens = tuple (Ngram.generator (n) for n in ns)
    223 		allstats = Stats ()
    224 		allgrams = {}
    225 		for key, lines in DiffHelpers.separate_test_cases (f):
    226 			test = Test (lines)
    227 			allstats.add (test)
    228 
    229 			for gen in gens:
    230 				for ngram in gen (test.unicodes):
    231 					if ngram not in allgrams:
    232 						allgrams[ngram] = Stats ()
    233 					allgrams[ngram].add (test)
    234 
    235 		importantgrams = {}
    236 		for ngram, stats in allgrams.iteritems ():
    237 			if stats.failed.count >= 30: # for statistical reasons
    238 				importantgrams[ngram] = stats
    239 		allgrams = importantgrams
    240 		del importantgrams
    241 
    242 		for ngram, stats in allgrams.iteritems ():
    243 			print "zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore (allstats), stats.failed.count, stats.passed.count, ','.join ("U+%04X" % u for u in ngram))
    244 
    245 
    246 
    247 class Test:
    248 
    249 	def __init__ (self, lines):
    250 		self.freq = 1
    251 		self.passed = True
    252 		self.identifier = None
    253 		self.text = None
    254 		self.unicodes = None
    255 		self.glyphs = None
    256 		for l in lines:
    257 			symbol = l[0]
    258 			if symbol != ' ':
    259 				self.passed = False
    260 			i = 1
    261 			if ':' in l:
    262 				i = l.index (':')
    263 				if not self.identifier:
    264 					self.identifier = l[1:i]
    265 				i = i + 2 # Skip colon and space
    266 			j = -1
    267 			if l[j] == '\n':
    268 				j -= 1
    269 			brackets = l[i] + l[j]
    270 			l = l[i+1:-2]
    271 			if brackets == '()':
    272 				self.text = l
    273 			elif brackets == '<>':
    274 				self.unicodes = Unicode.parse (l)
    275 			elif brackets == '[]':
    276 				# XXX we don't handle failed tests here
    277 				self.glyphs = l
    278 
    279 
    280 class DiffHelpers:
    281 
    282 	@staticmethod
    283 	def separate_test_cases (f):
    284 		'''Reads lines from f, and if the lines have identifiers, ie.
    285 		   have a colon character, groups them by identifier,
    286 		   yielding lists of all lines with the same identifier.'''
    287 
    288 		def identifier (l):
    289 			if ':' in l[1:]:
    290 				return l[1:l.index (':')]
    291 			return l
    292 		return groupby (f, key=identifier)
    293 
    294 	@staticmethod
    295 	def test_passed (lines):
    296 		lines = list (lines)
    297 		# XXX This is a hack, but does the job for now.
    298 		if any (l.find("space+0|space+0") >= 0 for l in lines if l[0] == '+'): return True
    299 		if any (l.find("uni25CC") >= 0 for l in lines if l[0] == '+'): return True
    300 		if any (l.find("dottedcircle") >= 0 for l in lines if l[0] == '+'): return True
    301 		if any (l.find("glyph0") >= 0 for l in lines if l[0] == '+'): return True
    302 		if any (l.find("gid0") >= 0 for l in lines if l[0] == '+'): return True
    303 		if any (l.find("notdef") >= 0 for l in lines if l[0] == '+'): return True
    304 		return all (l[0] == ' ' for l in lines)
    305 
    306 
    307 class FilterHelpers:
    308 
    309 	@staticmethod
    310 	def filter_printer_function (filter_callback):
    311 		def printer (f):
    312 			for line in filter_callback (f):
    313 				print line
    314 		return printer
    315 
    316 	@staticmethod
    317 	def filter_printer_function_no_newline (filter_callback):
    318 		def printer (f):
    319 			for line in filter_callback (f):
    320 				sys.stdout.writelines ([line])
    321 		return printer
    322 
    323 
    324 class Ngram:
    325 
    326 	@staticmethod
    327 	def generator (n):
    328 
    329 		def gen (f):
    330 			l = []
    331 			for x in f:
    332 				l.append (x)
    333 				if len (l) == n:
    334 					yield tuple (l)
    335 					l[:1] = []
    336 
    337 		gen.n = n
    338 		return gen
    339 
    340 
    341 class UtilMains:
    342 
    343 	@staticmethod
    344 	def process_multiple_files (callback, mnemonic = "FILE"):
    345 
    346 		if "--help" in sys.argv:
    347 			print "Usage: %s %s..." % (sys.argv[0], mnemonic)
    348 			sys.exit (1)
    349 
    350 		try:
    351 			files = sys.argv[1:] if len (sys.argv) > 1 else ['-']
    352 			for s in files:
    353 				callback (FileHelpers.open_file_or_stdin (s))
    354 		except IOError as e:
    355 			if e.errno != errno.EPIPE:
    356 				print >> sys.stderr, "%s: %s: %s" % (sys.argv[0], e.filename, e.strerror)
    357 				sys.exit (1)
    358 
    359 	@staticmethod
    360 	def process_multiple_args (callback, mnemonic):
    361 
    362 		if len (sys.argv) == 1 or "--help" in sys.argv:
    363 			print "Usage: %s %s..." % (sys.argv[0], mnemonic)
    364 			sys.exit (1)
    365 
    366 		try:
    367 			for s in sys.argv[1:]:
    368 				callback (s)
    369 		except IOError as e:
    370 			if e.errno != errno.EPIPE:
    371 				print >> sys.stderr, "%s: %s: %s" % (sys.argv[0], e.filename, e.strerror)
    372 				sys.exit (1)
    373 
    374 	@staticmethod
    375 	def filter_multiple_strings_or_stdin (callback, mnemonic, \
    376 					      separator = " ", \
    377 					      concat_separator = False):
    378 
    379 		if "--help" in sys.argv:
    380 			print "Usage:\n  %s %s...\nor:\n  %s\n\nWhen called with no arguments, input is read from standard input." \
    381 			      % (sys.argv[0], mnemonic, sys.argv[0])
    382 			sys.exit (1)
    383 
    384 		try:
    385 			if len (sys.argv) == 1:
    386 				while (1):
    387 					line = sys.stdin.readline ()
    388 					if not len (line):
    389 						break
    390 					if line[-1] == '\n':
    391 						line = line[:-1]
    392 					print callback (line)
    393 			else:
    394 				args = sys.argv[1:]
    395 				if concat_separator != False:
    396 					args = [concat_separator.join (args)]
    397 				print separator.join (callback (x) for x in (args))
    398 		except IOError as e:
    399 			if e.errno != errno.EPIPE:
    400 				print >> sys.stderr, "%s: %s: %s" % (sys.argv[0], e.filename, e.strerror)
    401 				sys.exit (1)
    402 
    403 
    404 class Unicode:
    405 
    406 	@staticmethod
    407 	def decode (s):
    408 		return u','.join ("U+%04X" % ord (u) for u in unicode (s, 'utf-8')).encode ('utf-8')
    409 
    410 	@staticmethod
    411 	def parse (s):
    412 		s = re.sub (r"0[xX]", " ", s)
    413 		s = re.sub (r"[<+>,;&#\\xXuU\n	]", " ", s)
    414 		return [int (x, 16) for x in s.split (' ') if len (x)]
    415 
    416 	@staticmethod
    417 	def encode (s):
    418 		return u''.join (unichr (x) for x in Unicode.parse (s)).encode ('utf-8')
    419 
    420 	shorthands = {
    421 		"ZERO WIDTH NON-JOINER": "ZWNJ",
    422 		"ZERO WIDTH JOINER": "ZWJ",
    423 		"NARROW NO-BREAK SPACE": "NNBSP",
    424 		"COMBINING GRAPHEME JOINER": "CGJ",
    425 		"LEFT-TO-RIGHT MARK": "LRM",
    426 		"RIGHT-TO-LEFT MARK": "RLM",
    427 		"LEFT-TO-RIGHT EMBEDDING": "LRE",
    428 		"RIGHT-TO-LEFT EMBEDDING": "RLE",
    429 		"POP DIRECTIONAL FORMATTING": "PDF",
    430 		"LEFT-TO-RIGHT OVERRIDE": "LRO",
    431 		"RIGHT-TO-LEFT OVERRIDE": "RLO",
    432 	}
    433 
    434 	@staticmethod
    435 	def pretty_name (u):
    436 		try:
    437 			s = unicodedata.name (u)
    438 		except ValueError:
    439 			return "XXX"
    440 		s = re.sub (".* LETTER ", "", s)
    441 		s = re.sub (".* VOWEL SIGN (.*)", r"\1-MATRA", s)
    442 		s = re.sub (".* SIGN ", "", s)
    443 		s = re.sub (".* COMBINING ", "", s)
    444 		if re.match (".* VIRAMA", s):
    445 			s = "HALANT"
    446 		if s in Unicode.shorthands:
    447 			s = Unicode.shorthands[s]
    448 		return s
    449 
    450 	@staticmethod
    451 	def pretty_names (s):
    452 		s = re.sub (r"[<+>\\uU]", " ", s)
    453 		s = re.sub (r"0[xX]", " ", s)
    454 		s = [unichr (int (x, 16)) for x in re.split ('[, \n]', s) if len (x)]
    455 		return u' + '.join (Unicode.pretty_name (x) for x in s).encode ('utf-8')
    456 
    457 
    458 class FileHelpers:
    459 
    460 	@staticmethod
    461 	def open_file_or_stdin (f):
    462 		if f == '-':
    463 			return sys.stdin
    464 		return file (f)
    465 
    466 
    467 class Manifest:
    468 
    469 	@staticmethod
    470 	def read (s, strict = True):
    471 
    472 		if not os.path.exists (s):
    473 			if strict:
    474 				print >> sys.stderr, "%s: %s does not exist" % (sys.argv[0], s)
    475 				sys.exit (1)
    476 			return
    477 
    478 		s = os.path.normpath (s)
    479 
    480 		if os.path.isdir (s):
    481 
    482 			try:
    483 				m = file (os.path.join (s, "MANIFEST"))
    484 				items = [x.strip () for x in m.readlines ()]
    485 				for f in items:
    486 					for p in Manifest.read (os.path.join (s, f)):
    487 						yield p
    488 			except IOError:
    489 				if strict:
    490 					print >> sys.stderr, "%s: %s does not exist" % (sys.argv[0], os.path.join (s, "MANIFEST"))
    491 					sys.exit (1)
    492 				return
    493 		else:
    494 			yield s
    495 
    496 	@staticmethod
    497 	def update_recursive (s):
    498 
    499 		for dirpath, dirnames, filenames in os.walk (s, followlinks=True):
    500 
    501 			for f in ["MANIFEST", "README", "LICENSE", "COPYING", "AUTHORS", "SOURCES", "ChangeLog"]:
    502 				if f in dirnames:
    503 					dirnames.remove (f)
    504 				if f in filenames:
    505 					filenames.remove (f)
    506 			dirnames.sort ()
    507 			filenames.sort ()
    508 			ms = os.path.join (dirpath, "MANIFEST")
    509 			print "  GEN    %s" % ms
    510 			m = open (ms, "w")
    511 			for f in filenames:
    512 				print >> m, f
    513 			for f in dirnames:
    514 				print >> m, f
    515 			for f in dirnames:
    516 				Manifest.update_recursive (os.path.join (dirpath, f))
    517 
if __name__ == '__main__':
	# Running the module directly does nothing; it only provides the
	# classes defined above.
	pass
    520