# Plain text  |  520 lines  |  12.6 KB

#!/usr/bin/python

import sys, os, re, difflib, unicodedata, errno, cgi
from itertools import *

# One-character line prefixes used to tag diff lines by origin: the character
# at index i marks lines that came from the i-th input.
diff_symbols = "-+=*&^%$#@!~/"
# Highlight colour names, indexed the same way as diff_symbols.
diff_colors = ['red', 'green', 'blue']

class ColorFormatter:
	"""Namespace for the available output back-ends.

	Every nested class exposes the same four static hooks:
	start_color (c), end_color (), escape (s) and newline ().
	Auto () picks one of them from command-line style flags."""

	class Null:
		"""Plain output: no colour markup, no escaping."""

		@staticmethod
		def start_color (c):
			return ''

		@staticmethod
		def end_color ():
			return ''

		@staticmethod
		def escape (s):
			return s

		@staticmethod
		def newline ():
			return '\n'

	class ANSI:
		"""Terminal output using ANSI escape sequences."""

		@staticmethod
		def start_color (c):
			# Unknown colour names raise KeyError.
			sequences = {
				'red': '\033[41;37;1m',
				'green': '\033[42;37;1m',
				'blue': '\033[44;37;1m',
			}
			return sequences[c]

		@staticmethod
		def end_color ():
			return '\033[m'

		@staticmethod
		def escape (s):
			return s

		@staticmethod
		def newline ():
			return '\n'

	class HTML:
		"""HTML output: coloured <span> elements and <br/> line breaks."""

		@staticmethod
		def start_color (c):
			return '<span style="background:%s">' % c

		@staticmethod
		def end_color ():
			return '</span>'

		@staticmethod
		def escape (s):
			return cgi.escape (s)

		@staticmethod
		def newline ():
			return '<br/>\n'

	@staticmethod
	def Auto (argv = [], out = sys.stdout):
		"""Choose a formatter class from the flags found in argv.

		Recognised flags are removed from argv in place (first occurrence
		only).  When several are present the one listed last in the table
		below wins.  The default is ANSI.  out is accepted but not used."""
		flag_table = (
			("--format", ColorFormatter.ANSI),
			("--format=ansi", ColorFormatter.ANSI),
			("--format=html", ColorFormatter.HTML),
			("--no-format", ColorFormatter.Null),
		)
		chosen = ColorFormatter.ANSI
		for flag, backend in flag_table:
			if flag in argv:
				argv.remove (flag)
				chosen = backend
		return chosen


class DiffColorizer:
	"""Marks up the parts that differ between two versions of a line.

	formatter is an object providing start_color, end_color, escape and
	newline (such as the ColorFormatter back-ends).  colors[i] is the
	highlight used for text belonging to the side tagged symbols[i].
	Only the first two symbols/sides are supported by the methods below."""

	# Tokenizer: group 1 is a (possibly empty) run of ASCII word characters,
	# group 2 is the single non-word character following it, if any.  The
	# second class has to be the exact complement of the first, so both spell
	# out a-z and A-Z; otherwise upper-case letters get split off one by one.
	diff_regex = re.compile ('([a-zA-Z0-9_]*)([^a-zA-Z0-9_]?)')

	def __init__ (self, formatter, colors=diff_colors, symbols=diff_symbols):
		self.formatter = formatter
		self.colors = colors
		self.symbols = symbols

	def colorize_lines (self, lines):
		"""Return the marked-up form of a pair of lines.

		lines is a two-element sequence; a missing side may be None or ''.
		Each line is split into tokens, the two token lists are compared
		with difflib.Differ, and tokens present on one side only are
		wrapped in that side's colour.  Returns a list holding, for every
		side whose output is non-empty, its symbol followed by the
		marked-up text and formatter.newline ()."""
		lines = (l if l else '' for l in lines)
		# Put every token on its own line so Differ compares token by token.
		ss = [self.diff_regex.sub (r'\1\n\2\n', l).splitlines (True) for l in lines]
		oo = ["",""] # output accumulated per side
		st = [False, False] # is a colour span currently open on that side?
		for l in difflib.Differ().compare (*ss):
			if l[0] == '?':
				# Differ's intraline hint lines carry no token text.
				continue
			if l[0] == ' ':
				# Token common to both sides: close open spans, emit on both.
				for i in range(2):
					if st[i]:
						oo[i] += self.formatter.end_color ()
						st[i] = False
				oo = [o + self.formatter.escape (l[2:]) for o in oo]
				continue
			if l[0] in self.symbols:
				# Token unique to one side: open that side's span if needed.
				i = self.symbols.index (l[0])
				if not st[i]:
					oo[i] += self.formatter.start_color (self.colors[i])
					st[i] = True
				oo[i] += self.formatter.escape (l[2:])
				continue
		for i in range(2):
			if st[i]:
				oo[i] += self.formatter.end_color ()
				st[i] = False
		# Drop the per-token newlines introduced by the tokenizer.
		oo = [o.replace ('\n', '') for o in oo]
		return [s1+s2+self.formatter.newline () for (s1,s2) in zip (self.symbols, oo) if s2]

	def colorize_diff (self, f):
		"""Generator yielding the marked-up version of the diff lines in f.

		A line whose first character is not in symbols is yielded right
		away, escaped, with '\\n' replaced by formatter.newline (); pending
		tagged lines are not flushed at that point.  Tagged lines are
		buffered, at most one per side, and passed to colorize_lines ()
		when both sides are filled, when a side receives a second line, or
		at end of input.

		NOTE: the buffer has two slots, so a line tagged with a symbol at
		index 2 or above raises IndexError."""
		lines = [None, None]
		for l in f:
			if l[0] not in self.symbols:
				yield self.formatter.escape (l).replace ('\n', self.formatter.newline ())
				continue
			i = self.symbols.index (l[0])
			if lines[i]:
				# This side is already occupied: flush what we have first.
				for line in self.colorize_lines (lines):
					yield line
				lines = [None, None]
			lines[i] = l[1:]
			if (all (lines)):
				# Both sides present: flush the pair.
				for line in self.colorize_lines (lines):
					yield line
				lines = [None, None]
		if (any (lines)):
			# Flush whatever is left at end of input.
			for line in self.colorize_lines (lines):
				yield line


class ZipDiffer:
	"""Line-by-line ("zipped") comparison of several inputs."""

	@staticmethod
	def diff_files (files, symbols=diff_symbols):
		"""Write a line-parallel diff of files to standard output.

		files is an iterable of line iterables.  The n-th lines of all
		inputs are compared: if they are all equal the line is written
		once prefixed with a space, otherwise every non-empty variant is
		written prefixed with the symbol of the input it came from.
		An IOError other than EPIPE is reported on stderr and exits with
		status 1; EPIPE is ignored."""
		sources = tuple (files) # in case it's a generator, copy it
		try:
			for row in izip_longest (*sources):
				first = row[0]
				if all (first == other for other in row[1:]):
					sys.stdout.writelines ([" ", first])
				else:
					for index, text in enumerate (row):
						if text:
							sys.stdout.writelines ([symbols[index], text])
		except IOError as err:
			if err.errno != errno.EPIPE:
				message = "%s: %s: %s" % (sys.argv[0], err.filename, err.strerror)
				sys.stderr.write (message + "\n")
				sys.exit (1)


class DiffFilters:
	"""Filters over a stream of diff lines."""

	@staticmethod
	def filter_failures (f):
		"""Generator yielding only the lines of test cases that did not pass.

		Lines are grouped with DiffHelpers.separate_test_cases () and each
		group is judged by DiffHelpers.test_passed ()."""
		for _key, group in DiffHelpers.separate_test_cases (f):
			case = list (group)
			if DiffHelpers.test_passed (case):
				continue
			for line in case:
				yield line

class Stat:
	"""Running tally: how many tests were added and the sum of their freq."""

	def __init__ (self):
		self.count, self.freq = 0, 0

	def add (self, test):
		"""Account for one more test; test must provide a freq attribute."""
		self.count = self.count + 1
		self.freq = self.freq + test.freq

class Stats:
	"""Pass/fail tallies for a set of tests, plus derived statistics."""

	def __init__ (self):
		self.passed, self.failed, self.total = Stat (), Stat (), Stat ()

	def add (self, test):
		"""Record test in the total and in the passed or failed tally."""
		self.total.add (test)
		bucket = self.passed if test.passed else self.failed
		bucket.add (test)

	def mean (self):
		"""Fraction of recorded tests that passed."""
		return float (self.passed.count) / self.total.count

	def variance (self):
		"""Pass fraction multiplied by fail fraction."""
		pass_rate = float (self.passed.count) / self.total.count
		fail_rate = float (self.failed.count) / self.total.count
		return pass_rate * fail_rate

	def stddev (self):
		"""Square root of variance ()."""
		return self.variance () ** .5

	def zscore (self, population):
		"""Calculate the standard score.
		   Population is the Stats for population.
		   Self is Stats for sample.
		   Returns larger absolute value if sample is highly unlikely to be random.
		   Anything outside of -3..+3 is very unlikely to be random.
		   See: http://en.wikipedia.org/wiki/Standard_score"""
		deviation = self.mean () - population.mean ()
		return deviation / population.stddev ()




class DiffSinks:
	"""Consumers that read a stream of diff lines and print a report."""

	@staticmethod
	def print_stat (f):
		"""Print how many test cases in f passed and failed.

		Test cases are delimited by DiffHelpers.separate_test_cases () and
		judged by DiffHelpers.test_passed ().  Input without any test case
		is reported as 0 out of 0 with a failure rate of 0%."""
		passed = 0
		failed = 0
		# XXX port to Stats, but that would really slow us down here
		for key, lines in DiffHelpers.separate_test_cases (f):
			if DiffHelpers.test_passed (lines):
				passed += 1
			else:
				failed += 1
		total = passed + failed
		# No test cases at all: avoid dividing by zero.
		percent = 100. * failed / total if total else 0.
		print ("%d out of %d tests passed.  %d failed (%g%%)" % (passed, total, failed, percent))

	@staticmethod
	def print_ngrams (f, ns=(1,2,3)):
		"""Print a z-score line for every frequently failing n-gram.

		Each test case in f is parsed into a Test; for every n in ns, the
		n-grams of its unicodes are tallied in a per-n-gram Stats.  Only
		n-grams seen in at least 30 failing tests are reported, each with
		its z-score against the statistics of all tests."""
		gens = tuple (Ngram.generator (n) for n in ns)
		allstats = Stats ()
		allgrams = {}
		for key, lines in DiffHelpers.separate_test_cases (f):
			test = Test (lines)
			allstats.add (test)

			for gen in gens:
				for ngram in gen (test.unicodes):
					if ngram not in allgrams:
						allgrams[ngram] = Stats ()
					allgrams[ngram].add (test)

		# Keep only n-grams with enough failures to be statistically meaningful.
		importantgrams = {}
		for ngram, stats in allgrams.items ():
			if stats.failed.count >= 30: # for statistical reasons
				importantgrams[ngram] = stats
		allgrams = importantgrams
		del importantgrams

		for ngram, stats in allgrams.items ():
			print ("zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore (allstats), stats.failed.count, stats.passed.count, ','.join ("U+%04X" % u for u in ngram)))



class Test:
	"""One test case, parsed from the diff lines that belong to it.

	Attributes set by the constructor:
	  freq        always 1
	  passed      False as soon as one line does not start with a space
	  identifier  text between the leading symbol and the first colon of
	              the first line yielding a non-empty value, else None
	  text        payload of the last line bracketed by ()
	  unicodes    Unicode.parse () of the payload of the last <> line
	  glyphs      payload of the last line bracketed by []"""

	def __init__ (self, lines):
		self.freq = 1
		self.passed = True
		self.identifier = None
		self.text = None
		self.unicodes = None
		self.glyphs = None
		for l in lines:
			symbol = l[0]
			if symbol != ' ':
				self.passed = False
			# i ends up indexing the opening bracket.
			i = 1
			if ':' in l:
				i = l.index (':')
				if not self.identifier:
					self.identifier = l[1:i]
				i = i + 2 # Skip colon and space
			# j indexes the closing bracket: the last character, or the one
			# before it when the line ends with a newline.
			j = -1
			if l[j] == '\n':
				j -= 1
			brackets = l[i] + l[j]
			# Slice up to j (not a fixed -2) so that a line without a
			# trailing newline keeps the last character of its payload.
			l = l[i+1:j]
			if brackets == '()':
				self.text = l
			elif brackets == '<>':
				self.unicodes = Unicode.parse (l)
			elif brackets == '[]':
				# XXX we don't handle failed tests here
				self.glyphs = l


class DiffHelpers:
	"""Shared helpers for working with streams of diff lines."""

	@staticmethod
	def separate_test_cases (f):
		'''Groups consecutive lines of f by test identifier.
		   The identifier of a line is the text between its first
		   character and its first colon; a line with no colon after
		   its first character is its own key.  Returns the
		   itertools.groupby iterator of (key, lines) pairs.'''

		def case_key (line):
			if ':' not in line[1:]:
				return line
			return line[1:line.index (':')]

		return groupby (f, key=case_key)

	@staticmethod
	def test_passed (lines):
		'''Tells whether the diff lines of one test case count as a pass.
		   A case passes if any '+' line contains one of the markers
		   below, or if every line starts with a space.'''
		lines = list (lines)
		# XXX This is a hack, but does the job for now.
		markers = (
			"space+0|space+0",
			"uni25CC",
			"dottedcircle",
			"glyph0",
			"gid0",
			"notdef",
		)
		for marker in markers:
			if any (marker in l for l in lines if l[0] == '+'):
				return True
		return all (l[0] == ' ' for l in lines)


class FilterHelpers:
	"""Adapters that turn a line filter into a function printing its output."""

	@staticmethod
	def filter_printer_function (filter_callback):
		"""Return a function printing every item of filter_callback (f),
		   each followed by a newline."""
		def printer (f):
			for item in filter_callback (f):
				print (item)
		return printer

	@staticmethod
	def filter_printer_function_no_newline (filter_callback):
		"""Return a function writing every item of filter_callback (f) to
		   standard output as is, without adding a newline."""
		def printer (f):
			for item in filter_callback (f):
				sys.stdout.write (item)
		return printer


class Ngram:
	"""Factory for n-gram generators."""

	@staticmethod
	def generator (n):
		"""Return a generator function that yields, as tuples, every run
		   of n consecutive items of the iterable it is given.  The
		   returned function carries n as its attribute of the same name."""

		def gen (f):
			window = []
			for item in f:
				window.append (item)
				if len (window) != n:
					continue
				yield tuple (window)
				# Slide the window forward by one item.
				del window[0]

		gen.n = n
		return gen


class UtilMains:
	"""Command-line scaffolding shared by the small tools.

	All helpers read sys.argv directly.  --help prints a usage message and
	exits with status 1.  An IOError other than EPIPE is reported on
	stderr and exits with status 1; EPIPE is silently ignored."""

	@staticmethod
	def process_multiple_files (callback, mnemonic = "FILE"):
		"""Call callback with an open file for every command-line argument;
		   with no arguments, standard input ('-') is used."""

		if "--help" in sys.argv:
			print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
			sys.exit (1)

		names = sys.argv[1:] or ['-']
		try:
			for name in names:
				callback (FileHelpers.open_file_or_stdin (name))
		except IOError as err:
			if err.errno == errno.EPIPE:
				return
			message = "%s: %s: %s" % (sys.argv[0], err.filename, err.strerror)
			sys.stderr.write (message + "\n")
			sys.exit (1)

	@staticmethod
	def process_multiple_args (callback, mnemonic):
		"""Call callback with every command-line argument in turn; at
		   least one argument is required."""

		wants_help = "--help" in sys.argv
		if len (sys.argv) == 1 or wants_help:
			print ("Usage: %s %s..." % (sys.argv[0], mnemonic))
			sys.exit (1)

		try:
			for arg in sys.argv[1:]:
				callback (arg)
		except IOError as err:
			if err.errno == errno.EPIPE:
				return
			message = "%s: %s: %s" % (sys.argv[0], err.filename, err.strerror)
			sys.stderr.write (message + "\n")
			sys.exit (1)

	@staticmethod
	def filter_multiple_strings_or_stdin (callback, mnemonic, \
					      separator = " ", \
					      concat_separator = False):
		"""Print callback applied to the arguments, or to the lines of stdin.

		With no arguments each line of standard input (minus its trailing
		newline) is passed to callback and the result printed.  Otherwise
		callback is applied to every argument and the results are printed
		joined by separator; if concat_separator is not False the
		arguments are first joined with it into a single string."""

		if "--help" in sys.argv:
			usage = "Usage:\n  %s %s...\nor:\n  %s\n\nWhen called with no arguments, input is read from standard input." \
				% (sys.argv[0], mnemonic, sys.argv[0])
			print (usage)
			sys.exit (1)

		try:
			if len (sys.argv) == 1:
				# readline () returns '' only at end of input.
				for line in iter (sys.stdin.readline, ''):
					if line[-1] == '\n':
						line = line[:-1]
					print (callback (line))
			else:
				pieces = sys.argv[1:]
				if concat_separator != False:
					pieces = [concat_separator.join (pieces)]
				print (separator.join (callback (piece) for piece in pieces))
		except IOError as err:
			if err.errno == errno.EPIPE:
				return
			message = "%s: %s: %s" % (sys.argv[0], err.filename, err.strerror)
			sys.stderr.write (message + "\n")
			sys.exit (1)


class Unicode:
	"""Conversions between UTF-8 text, code point lists and character names."""

	@staticmethod
	def decode (s):
		"""Turn UTF-8 encoded text into a comma-separated U+XXXX list."""
		text = unicode (s, 'utf-8')
		labels = ["U+%04X" % ord (ch) for ch in text]
		return u','.join (labels).encode ('utf-8')

	@staticmethod
	def parse (s):
		"""Return the list of integers spelled in hexadecimal in s.

		"0x"/"0X" prefixes and the characters < + > , ; & # \\ x X u U,
		newline and tab all act as separators."""
		cleaned = re.sub (r"0[xX]", " ", s)
		cleaned = re.sub (r"[<+>,;&#\\xXuU\n	]", " ", cleaned)
		return [int (token, 16) for token in cleaned.split ()]

	@staticmethod
	def encode (s):
		"""Turn a textual list of code points into UTF-8 encoded text."""
		chars = [unichr (code) for code in Unicode.parse (s)]
		return u''.join (chars).encode ('utf-8')

	# Abbreviations substituted for the full character name.
	shorthands = {
		"ZERO WIDTH NON-JOINER": "ZWNJ",
		"ZERO WIDTH JOINER": "ZWJ",
		"NARROW NO-BREAK SPACE": "NNBSP",
		"COMBINING GRAPHEME JOINER": "CGJ",
		"LEFT-TO-RIGHT MARK": "LRM",
		"RIGHT-TO-LEFT MARK": "RLM",
		"LEFT-TO-RIGHT EMBEDDING": "LRE",
		"RIGHT-TO-LEFT EMBEDDING": "RLE",
		"POP DIRECTIONAL FORMATTING": "PDF",
		"LEFT-TO-RIGHT OVERRIDE": "LRO",
		"RIGHT-TO-LEFT OVERRIDE": "RLO",
	}

	@staticmethod
	def pretty_name (u):
		"""Return a shortened character name for the character u, or
		   "XXX" if unicodedata has no name for it."""
		try:
			name = unicodedata.name (u)
		except ValueError:
			return "XXX"
		# Applied in order; each rule sees the result of the previous one.
		rewrites = (
			(".* LETTER ", ""),
			(".* VOWEL SIGN (.*)", r"\1-MATRA"),
			(".* SIGN ", ""),
			(".* COMBINING ", ""),
		)
		for pattern, replacement in rewrites:
			name = re.sub (pattern, replacement, name)
		if re.match (".* VIRAMA", name):
			name = "HALANT"
		return Unicode.shorthands.get (name, name)

	@staticmethod
	def pretty_names (s):
		"""Return the pretty names of the code points listed in s,
		   joined by ' + ' and UTF-8 encoded."""
		cleaned = re.sub (r"[<+>\\uU]", " ", s)
		cleaned = re.sub (r"0[xX]", " ", cleaned)
		tokens = [token for token in re.split ('[, \n]', cleaned) if len (token)]
		chars = [unichr (int (token, 16)) for token in tokens]
		return u' + '.join (Unicode.pretty_name (ch) for ch in chars).encode ('utf-8')


class FileHelpers:
	"""Small helpers for opening input files."""

	@staticmethod
	def open_file_or_stdin (f):
		"""Return sys.stdin if f is '-', otherwise the file named f opened
		   for reading.  Raises IOError if the file cannot be opened."""
		if f == '-':
			return sys.stdin
		# open () is the documented way to open a file; the file ()
		# constructor is meant for type tests only.
		return open (f)


class Manifest:
	"""Reading and regenerating the per-directory MANIFEST files."""

	@staticmethod
	def read (s, strict = True):
		"""Generator yielding the files reachable from path s.

		A plain file yields its own normalized path.  A directory is
		expanded through its MANIFEST file, which lists one entry per
		line relative to that directory; entries are expanded
		recursively and blank lines are skipped.

		If s does not exist, or is a directory whose MANIFEST cannot be
		read, then with strict set a message is written to stderr and
		the process exits with status 1; otherwise nothing is yielded.
		Entries listed inside a MANIFEST are always expanded with the
		default strict setting."""

		if not os.path.exists (s):
			if strict:
				sys.stderr.write ("%s: %s does not exist\n" % (sys.argv[0], s))
				sys.exit (1)
			return

		s = os.path.normpath (s)

		if not os.path.isdir (s):
			yield s
			return

		try:
			# Read and close the MANIFEST before descending, so handles do
			# not pile up during recursion.
			with open (os.path.join (s, "MANIFEST")) as m:
				items = [x.strip () for x in m.readlines ()]
			for f in items:
				if not f:
					# A blank entry would join back to s itself and recurse forever.
					continue
				for p in Manifest.read (os.path.join (s, f)):
					yield p
		except IOError:
			if strict:
				sys.stderr.write ("%s: %s does not exist\n" % (sys.argv[0], os.path.join (s, "MANIFEST")))
				sys.exit (1)
			return

	@staticmethod
	def update_recursive (s):
		"""(Re)write the MANIFEST file of s and of every directory below it.

		Each MANIFEST lists the directory's files, sorted, followed by its
		subdirectories, sorted.  Entries named MANIFEST, README, LICENSE,
		COPYING, AUTHORS, SOURCES or ChangeLog are left out, and
		directories with those names are not descended into.  Symbolic
		links to directories are followed."""

		for dirpath, dirnames, filenames in os.walk (s, followlinks=True):

			# Pruning and sorting dirnames in place also steers os.walk.
			for f in ["MANIFEST", "README", "LICENSE", "COPYING", "AUTHORS", "SOURCES", "ChangeLog"]:
				if f in dirnames:
					dirnames.remove (f)
				if f in filenames:
					filenames.remove (f)
			dirnames.sort ()
			filenames.sort ()
			ms = os.path.join (dirpath, "MANIFEST")
			print ("  GEN    %s" % ms)
			with open (ms, "w") as m:
				for f in filenames:
					m.write (f + "\n")
				for f in dirnames:
					m.write (f + "\n")
			# No explicit recursion here: os.walk itself descends into every
			# directory left in dirnames, so each one is generated exactly once.

# This file only provides definitions: running it directly does nothing.
if __name__ == '__main__':
	pass