Home | History | Annotate | Download | only in gtk
      1 #!/usr/bin/env python
      2 # -*- coding: utf-8 -*-
      3 #
      4 # compose-parse.py, version 1.3
      5 #
      6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
      7 # the script produces statistics and information about the whole process, run with --help for more.
      8 #
      9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
     10 #
     11 # Complain to Simos Xenitellis (simos (at] gnome.org, http://simos.info/blog) for this craft.
     12 
     13 from re			import findall, match, split, sub
     14 from string		import atoi
     15 from unicodedata	import normalize
     16 from urllib 		import urlretrieve
     17 from os.path		import isfile, getsize
     18 from copy 		import copy
     19 
     20 import sys
     21 import getopt
     22 
# We grab files off the web, left and right.
# Each URL below is handed to download_file(), which caches the result in the
# current directory under the last path component of the URL.
URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
URL_GDKKEYSYMSH = "http://git.gnome.org/browse/gtk%2B/plain/gdk/gdkkeysyms.h"
URL_UNICODEDATATXT = 'http://www.unicode.org/Public/5.2.0/ucd/UnicodeData.txt'
# Optional local file whose lines are appended to the upstream Compose data.
FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt'

# We currently support keysyms of size 2; once upstream xorg gets sorted, 
# we might produce some tables with size 2 and some with size 4.
SIZEOFINT = 2

# Current max compose sequence length; in case it gets increased.
WIDTHOFCOMPOSETABLE = 5

# Lookup tables; the first two are replaced further down by the results of
# process_gdkkeysymsh() and process_keysymstxt() respectively.
keysymdatabase = {}
keysymunicodedatabase = {}
# NOTE(review): presumably filled from UnicodeData.txt later in the file -- confirm.
unicodedatabase = {}
     40 
# Text printed ahead of the generated table when --gtk is given: licence,
# provenance notes, include guard and the opening of the C array.
headerfile_start = """/* GTK - The GIMP Tool Kit
 * Copyright (C) 2007, 2008 GNOME Foundation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

/*
 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
 * using the input files
 *  Input   : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
 *  Input   : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
 *  Input   : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
 *
 * This table is optimised for space and requires special handling to access the content.
 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
 * 
 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
 */

/*
 * Modified by the GTK+ Team and others 2007, 2008.  See the AUTHORS
 * file for a list of people on the GTK+ Team.  See the ChangeLog
 * files for a list of changes.  These files are distributed with
 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
 */

#ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
#define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__

/* === These are the original comments of the file; we keep for historical purposes ===
 *
 * The following table was generated from the X compose tables include with
 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor (at] redhat.com>
 * to obtain the relevant perl scripts.
 *
 * The following compose letter letter sequences confliced
 *   Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
 *                                ETH (Icelandic, Faroese, old English, IPA)  [ D- -D d- -d ]
 *   Amacron/amacron and ordfeminine; resolved to ordfeminine                 [ _A A_ a_ _a ]
 *   Amacron/amacron and Atilde/atilde; resolved to atilde                    [ -A A- a- -a ]
 *   Omacron/Omacron and masculine; resolved to masculine                     [ _O O_ o_ _o ]
 *   Omacron/omacron and Otilde/atilde; resolved to otilde                    [ -O O- o- -o ]
 *
 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
 *   spanish. atilde and otilde are used at least for Portuguese ]
 *
 *   at and Aring; resolved to Aring                                          [ AA ]
 *   guillemotleft and caron; resolved to guillemotleft                       [ << ]
 *   ogonek and cedilla; resolved to cedilla                                  [ ,, ]
 *
 * This probably should be resolved by first checking an additional set of compose tables
 * that depend on the locale or selected input method.
 */

static const guint16 gtk_compose_seqs_compact[] = {"""

# Closes the C array and the include guard opened by headerfile_start.
# NOTE(review): presumably printed after the table rows -- the print is not in this part of the file.
headerfile_end = """};

#endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
"""
    115 
    116 def stringtohex(str): return atoi(str, 16)
    117 
    118 def factorial(n): 
    119 	if n <= 1:
    120 		return 1
    121 	else:
    122 		return n * factorial(n-1)
    123 
    124 def uniq(*args) :
    125 	""" Performs a uniq operation on a list or lists """
    126     	theInputList = []
    127     	for theList in args:
    128     	   theInputList += theList
    129     	theFinalList = []
    130     	for elem in theInputList:
    131 		if elem not in theFinalList:
    132           		theFinalList.append(elem)
    133     	return theFinalList
    134 
    135 
    136 
    137 def all_permutations(seq):
    138 	""" Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178 """
    139 	""" Produces all permutations of the items of a list """
    140     	if len(seq) <=1:
    141     	    yield seq
    142     	else:
    143     	    for perm in all_permutations(seq[1:]):
    144     	        for i in range(len(perm)+1):
    145     	            #nb str[0:1] works in both string and list contexts
    146         	        yield perm[:i] + seq[0:1] + perm[i:]
    147 
    148 def usage():
    149 	print """compose-parse available parameters:
    150 	-h, --help		this craft
    151 	-s, --statistics	show overall statistics (both algorithmic, non-algorithmic)
    152 	-a, --algorithmic	show sequences saved with algorithmic optimisation
    153 	-g, --gtk		show entries that go to GTK+
    154 	-u, --unicodedatatxt	show compose sequences derived from UnicodeData.txt (from unicode.org)
    155 	-v, --verbose		show verbose output
    156         -p, --plane1		show plane1 compose sequences
    157 	-n, --numeric		when used with --gtk, create file with numeric values only
    158 	-e, --gtk-expanded	when used with --gtk, create file that repeats first column; not usable in GTK+
    159 	--all-sequences		when used with --gtk, create file with entries rejected by default
    160 	Default is to show statistics.
    161 	"""
    162 
    163 try: 
    164 	opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt", 
    165 		"stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded", "all-sequences"])
    166 except: 
    167 	usage()
    168 	sys.exit(2)
    169 
    170 opt_statistics = False
    171 opt_algorithmic = False
    172 opt_gtk = False
    173 opt_unicodedatatxt = False
    174 opt_verbose = False
    175 opt_plane1 = False
    176 opt_numeric = False
    177 opt_gtkexpanded = False
    178 opt_allsequences = False
    179 
    180 for o, a in opts:
    181 	if o in ("-h", "--help"):
    182 		usage()
    183 		sys.exit()
    184 	if o in ("-s", "--statistics"):
    185 		opt_statistics = True
    186 	if o in ("-a", "--algorithmic"):
    187 		opt_algorithmic = True
    188 	if o in ("-g", "--gtk"):
    189 		opt_gtk = True	
    190 	if o in ("-u", "--unicodedatatxt"):
    191 		opt_unicodedatatxt = True
    192 	if o in ("-v", "--verbose"):
    193 		opt_verbose = True
    194 	if o in ("-p", "--plane1"):
    195 		opt_plane1 = True
    196 	if o in ("-n", "--numeric"):
    197 		opt_numeric = True
    198 	if o in ("-e", "--gtk-expanded"):
    199 		opt_gtkexpanded = True
    200 	if o == "--all-sequences":
    201 		opt_allsequences = True
    202 
    203 if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
    204 	opt_statistics = True
    205 
    206 def download_hook(blocks_transferred, block_size, file_size):
    207 	""" A download hook to provide some feedback when downloading """
    208 	if blocks_transferred == 0:
    209 		if file_size > 0:
    210 			if opt_verbose:
    211 				print "Downloading", file_size, "bytes: ",
    212 		else:	
    213 			if opt_verbose:
    214 				print "Downloading: ",
    215 	sys.stdout.write('#')
    216 	sys.stdout.flush()
    217 
    218 
    219 def download_file(url):
    220 	""" Downloads a file provided a URL. Returns the filename. """
    221 	""" Borks on failure """
    222 	localfilename = url.split('/')[-1]
    223         if not isfile(localfilename) or getsize(localfilename) <= 0:
    224 		if opt_verbose:
    225 			print "Downloading ", url, "..."
    226 		try: 
    227 			urlretrieve(url, localfilename, download_hook)
    228 		except IOError, (errno, strerror):
    229 			print "I/O error(%s): %s" % (errno, strerror)
    230 			sys.exit(-1)
    231 		except:
    232 			print "Unexpected error: ", sys.exc_info()[0]
    233 			sys.exit(-1)
    234 		print " done."
    235         else:
    236 		if opt_verbose:
    237                 	print "Using cached file for ", url
    238 	return localfilename
    239 
    240 def process_gdkkeysymsh():
    241 	""" Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
    242 	""" Fills up keysymdb with contents """
    243 	filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
    244 	try: 
    245 		gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
    246 	except IOError, (errno, strerror):
    247 		print "I/O error(%s): %s" % (errno, strerror)
    248 		sys.exit(-1)
    249 	except:
    250 		print "Unexpected error: ", sys.exc_info()[0]
    251 		sys.exit(-1)
    252 
    253 	""" Parse the gdkkeysyms.h file and place contents in  keysymdb """
    254 	linenum_gdkkeysymsh = 0
    255 	keysymdb = {}
    256 	for line in gdkkeysymsh.readlines():
    257 		linenum_gdkkeysymsh += 1
    258 		line = line.strip()
    259 		if line == "" or not match('^#define GDK_KEY_', line):
    260 			continue
    261 		components = split('\s+', line)
    262 		if len(components) < 3:
    263 			print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
    264 			% {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
    265 			print "Was expecting 3 items in the line"
    266 			sys.exit(-1)
    267 		if not match('^GDK_KEY_', components[1]):
    268 			print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
    269 			% {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
    270 			print "Was expecting a keysym starting with GDK_KEY_"
    271 			sys.exit(-1)
    272 		if match('^0x[0-9a-fA-F]+$', components[2]):
    273 			unival = long(components[2][2:], 16)
    274 			if unival == 0:
    275 				continue
    276 			keysymdb[components[1][8:]] = unival
    277 		else:
    278 			print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
    279 			% {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
    280 			print "Was expecting a hexadecimal number at the end of the line"
    281 			sys.exit(-1)
    282 	gdkkeysymsh.close()
    283 
    284 	""" Patch up the keysymdb with some of our own stuff """
    285 
    286 	""" This is for a missing keysym from the currently upstream file """
    287 	#keysymdb['dead_stroke'] = 0x338
    288 
    289 	""" This is for a missing keysym from the currently upstream file """
    290 	###keysymdb['dead_belowring'] = 0x323
    291 	###keysymdb['dead_belowmacron'] = 0x331
    292 	###keysymdb['dead_belowcircumflex'] = 0x32d
    293 	###keysymdb['dead_belowtilde'] = 0x330
    294 	###keysymdb['dead_belowbreve'] = 0x32e
    295 	###keysymdb['dead_belowdiaeresis'] = 0x324
    296 
    297 	""" This is^Wwas preferential treatment for Greek """
    298 	# keysymdb['dead_tilde'] = 0x342  		
    299 	""" This is^was preferential treatment for Greek """
    300 	#keysymdb['combining_tilde'] = 0x342	
    301 
    302 	""" Fixing VoidSymbol """
    303 	keysymdb['VoidSymbol'] = 0xFFFF
    304 
    305 	return keysymdb
    306 
    307 def process_keysymstxt():
    308 	""" Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
    309 	""" This file keeps a record between keysyms <-> unicode chars """
    310 	filename_keysymstxt = download_file(URL_KEYSYMSTXT)
    311 	try: 
    312 		keysymstxt = open(filename_keysymstxt, 'r')
    313 	except IOError, (errno, strerror):
    314 		print "I/O error(%s): %s" % (errno, strerror)
    315 		sys.exit(-1)
    316 	except:
    317 		print "Unexpected error: ", sys.exc_info()[0]
    318 		sys.exit(-1)
    319 
    320 	""" Parse the keysyms.txt file and place content in  keysymdb """
    321 	linenum_keysymstxt = 0
    322 	keysymdb = {}
    323 	for line in keysymstxt.readlines():
    324 		linenum_keysymstxt += 1
    325 		line = line.strip()
    326 		if line == "" or match('^#', line):
    327 			continue
    328 		components = split('\s+', line)
    329 		if len(components) < 5:
    330 			print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
    331 			% {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line}
    332 			print "Was expecting 5 items in the line"
    333 			sys.exit(-1)
    334 		if match('^U[0-9a-fA-F]+$', components[1]):
    335 			unival = long(components[1][1:], 16)
    336 		if unival == 0:
    337 			continue
    338 		keysymdb[components[4]] = unival
    339 	keysymstxt.close()
    340 
    341 	""" Patch up the keysymdb with some of our own stuff """
    342 	""" This is for a missing keysym from the currently upstream file """
    343 	###keysymdb['dead_belowring'] = 0x323
    344 	###keysymdb['dead_belowmacron'] = 0x331
    345 	###keysymdb['dead_belowcircumflex'] = 0x32d
    346 	###keysymdb['dead_belowtilde'] = 0x330
    347 	###keysymdb['dead_belowbreve'] = 0x32e
    348 	###keysymdb['dead_belowdiaeresis'] = 0x324
    349 
    350 	""" This is preferential treatment for Greek """
    351 	""" => we get more savings if used for Greek """
    352 	# keysymdb['dead_tilde'] = 0x342  		
    353 	""" This is preferential treatment for Greek """
    354 	# keysymdb['combining_tilde'] = 0x342	
    355 
    356 	""" This is for a missing keysym from Markus Kuhn's db """
    357 	keysymdb['dead_stroke'] = 0x338
    358 	""" This is for a missing keysym from Markus Kuhn's db """
    359 	keysymdb['Oslash'] = 0x0d8		
    360 	""" This is for a missing keysym from Markus Kuhn's db """
    361 	keysymdb['Ssharp'] = 0x1e9e
    362 
    363 	""" This is for a missing (recently added) keysym """
    364 	keysymdb['dead_psili'] = 0x313		
    365 	""" This is for a missing (recently added) keysym """
    366 	keysymdb['dead_dasia'] = 0x314		
    367 
    368 	""" Allows to import Multi_key sequences """
    369 	keysymdb['Multi_key'] = 0xff20
    370 
    371         keysymdb['zerosubscript'] = 0x2080
    372         keysymdb['onesubscript'] = 0x2081
    373         keysymdb['twosubscript'] = 0x2082
    374         keysymdb['threesubscript'] = 0x2083
    375         keysymdb['foursubscript'] = 0x2084
    376         keysymdb['fivesubscript'] = 0x2085
    377         keysymdb['sixsubscript'] = 0x2086
    378         keysymdb['sevensubscript'] = 0x2087
    379         keysymdb['eightsubscript'] = 0x2088
    380         keysymdb['ninesubscript'] = 0x2089
    381         keysymdb['dead_doublegrave'] = 0x030F
    382         keysymdb['dead_invertedbreve'] = 0x0311
    383 
    384 	return keysymdb
    385 
    386 def keysymvalue(keysym, file = "n/a", linenum = 0):
    387 	""" Extracts a value from the keysym """
    388 	""" Find the value of keysym, using the data from keysyms """
    389 	""" Use file and linenum to when reporting errors """
    390 	if keysym == "":
    391 		return 0
    392        	if keysymdatabase.has_key(keysym):
    393                	return keysymdatabase[keysym]
    394        	elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
    395                	return atoi(keysym[1:], 16)
    396        	elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
    397 		return atoi(keysym[2:], 16)
    398 	else:
    399         	print 'keysymvalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
    400                	#return -1
    401 		sys.exit(-1)
    402 
    403 def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
    404 	""" Extracts a value from the keysym """
    405 	""" Find the value of keysym, using the data from keysyms """
    406 	""" Use file and linenum to when reporting errors """
    407 	if keysym == "":
    408 		return 0
    409        	if keysymunicodedatabase.has_key(keysym):
    410                	return keysymunicodedatabase[keysym]
    411        	elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
    412                	return atoi(keysym[1:], 16)
    413        	elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
    414 		return atoi(keysym[2:], 16)
    415 	else:
    416         	print 'keysymunicodevalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
    417                	sys.exit(-1)
    418 
    419 def rename_combining(seq):
    420 	filtered_sequence = []
    421 	for ks in seq:
    422 		if findall('^combining_', ks):
    423 			ks = sub('^combining_', 'dead_', ks)
    424                 if ks == 'dead_double_grave':
    425                         ks = 'dead_doublegrave'
    426                 if ks == 'dead_inverted_breve':
    427                         ks = 'dead_invertedbreve'
    428 		filtered_sequence.append(ks)
    429 	return filtered_sequence
    430 
    431 
    432 keysymunicodedatabase = process_keysymstxt()
    433 keysymdatabase = process_gdkkeysymsh()
    434 
    435 """ Grab and open the compose file from upstream """
    436 filename_compose = download_file(URL_COMPOSE)
    437 try: 
    438 	composefile = open(filename_compose, 'r')
    439 except IOError, (errno, strerror):
    440 	print "I/O error(%s): %s" % (errno, strerror)
    441 	sys.exit(-1)
    442 except:
    443 	print "Unexpected error: ", sys.exc_info()[0]
    444 	sys.exit(-1)
    445 
    446 """ Look if there is a lookaside (supplementary) compose file in the current
    447     directory, and if so, open, then merge with upstream Compose file.
    448 """
    449 xorg_compose_sequences_raw = []
    450 for seq in composefile.readlines():
    451         xorg_compose_sequences_raw.append(seq)
    452 
    453 try:
    454         composefile_lookaside = open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r')
    455         for seq in composefile_lookaside.readlines():
    456                 xorg_compose_sequences_raw.append(seq)
    457 except IOError, (errno, strerror):
    458         if opt_verbose:
    459                 print "I/O error(%s): %s" % (errno, strerror)
    460                 print "Did not find lookaside compose file. Continuing..."
    461 except:
    462         print "Unexpected error: ", sys.exc_info()[0]
    463         sys.exit(-1)
    464 
    465 """ Parse the compose file in  xorg_compose_sequences"""
    466 xorg_compose_sequences = []
    467 xorg_compose_sequences_algorithmic = []
    468 linenum_compose = 0
    469 comment_nest_depth = 0
    470 for line in xorg_compose_sequences_raw:
    471 	linenum_compose += 1
    472 	line = line.strip()
    473 	if match("^XCOMM", line) or match("^#", line):
    474 		continue
    475 
    476 	line = sub(r"\/\*([^\*]*|[\*][^/])\*\/", "", line)
    477 
    478 	comment_start = line.find("/*")
    479 
    480 	if comment_start >= 0:
    481 		if comment_nest_depth == 0:
    482 			line = line[:comment_start]
    483 		else:
    484 			line = ""
    485 
    486 		comment_nest_depth += 1
    487 	else:
    488 		comment_end = line.find("*/")
    489 
    490 		if comment_end >= 0:
    491 			comment_nest_depth -= 1
    492 
    493 		if comment_nest_depth < 0:
    494 			print "Invalid comment %(linenum_compose)d in %(filename)s: \
    495 			Closing '*/' without opening '/*'" % { "linenum_compose": linenum_compose, "filename": filename_compose }
    496 			exit(-1)
    497 
    498 		if comment_nest_depth > 0:
    499 			line = ""
    500 		else:
    501 			line = line[comment_end + 2:]
    502 
    503 	if line is "":
    504 		continue
    505 
    506 	#line = line[:-1]
    507 	components = split(':', line)
    508 	if len(components) != 2:
    509 		print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
    510 		/value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
    511 		exit(-1)
    512 	(seq, val ) = split(':', line)
    513 	seq = seq.strip()
    514 	val = val.strip()
    515 	raw_sequence = findall('\w+', seq)
    516 	values = split('\s+', val)
    517 	unichar_temp = split('"', values[0])
    518 	unichar = unichar_temp[1]
    519 	if len(values) == 1:
    520 		continue
    521 	codepointstr = values[1]
    522 	if values[1] == '#':
    523 		# No codepoints that are >1 characters yet.
    524 		continue
    525 	if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
    526 		raw_sequence[0] = '0x' + raw_sequence[0][1:]
    527 	if  match('^U[0-9a-fA-F]+$', codepointstr):
    528 		codepoint = long(codepointstr[1:], 16)
    529 	elif keysymunicodedatabase.has_key(codepointstr):
    530 		#if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
    531 			#print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
    532 			#print raw_sequence, codepointstr
    533 		codepoint = keysymunicodedatabase[codepointstr]
    534 	else:
    535 		print
    536 		print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
    537 		 %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line }
    538 		exit(-1)
    539 	sequence = rename_combining(raw_sequence)
    540 	reject_this = False
    541 	for i in sequence:
    542 		if keysymvalue(i) > 0xFFFF:
    543 			reject_this = True
    544 			if opt_plane1:
    545 				print sequence
    546 			break
    547 		if keysymvalue(i) < 0:
    548 			reject_this = True
    549 			break
    550 	if reject_this:
    551 		continue
    552 	if "U0342" in sequence or \
    553 		"U0313" in sequence or \
    554 		"U0314" in sequence or \
    555 		"0x0313" in sequence or \
    556 		"0x0342" in sequence or \
    557 		"0x0314" in sequence:
    558 		continue
    559 	if "dead_belowring" in sequence or\
    560                 "dead_currency" in sequence or\
    561 		"dead_belowcomma" in sequence or\
    562 		"dead_belowmacron" in sequence or\
    563 		"dead_belowtilde" in sequence or\
    564 		"dead_belowbreve" in sequence or\
    565 		"dead_belowdiaeresis" in sequence or\
    566 		"dead_belowcircumflex" in sequence:
    567 		continue
    568 	#for i in range(len(sequence)):
    569 	#	if sequence[i] == "0x0342":
    570 	#		sequence[i] = "dead_tilde"
    571 	if "Multi_key" not in sequence:
    572 		""" Ignore for now >0xFFFF keysyms """
    573 		if codepoint < 0xFFFF:
    574 			original_sequence = copy(sequence)
    575 			stats_sequence = copy(sequence)
    576 			base = sequence.pop()
    577 			basechar = keysymvalue(base, filename_compose, linenum_compose)
    578 			
    579 			if basechar < 0xFFFF:
    580 				counter = 1
    581 				unisequence = []
    582 				not_normalised = True
    583 				skipping_this = False
    584 				for i in range(0, len(sequence)):
    585 					""" If the sequence has dead_tilde and is for Greek, we don't do algorithmically 
    586 					    because of lack of dead_perispomeni (i.e. conflict)
    587 					"""
    588 					bc = basechar
    589 					"""if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
    590 						skipping_this = True
    591 						break
    592 					if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
    593 						skipping_this = True
    594 						break
    595 					if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
    596 						skipping_this = True
    597 						break
    598 					if sequence[-1] == "dead_psili":
    599 						sequence[i] = "dead_horn"
    600 					if sequence[-1] == "dead_dasia":
    601 						sequence[-1] = "dead_ogonek"
    602 					"""
    603 					unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))
    604 					
    605 				if skipping_this:
    606 					unisequence = []
    607 				for perm in all_permutations(unisequence):
    608 					# print counter, original_sequence, unichr(basechar) + "".join(perm)
    609 					# print counter, map(unichr, perm)
    610 					normalized = normalize('NFC', unichr(basechar) + "".join(perm))
    611 					if len(normalized) == 1:
    612 						# print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
    613 						# % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
    614 						# print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
    615 						stats_sequence_data = map(keysymunicodevalue, stats_sequence)
    616 						stats_sequence_data.append(normalized)
    617 						xorg_compose_sequences_algorithmic.append(stats_sequence_data)
    618 						not_normalised = False
    619 						break;
    620 					counter += 1
    621 				if not_normalised or opt_allsequences:
    622 					original_sequence.append(codepoint)
    623 					xorg_compose_sequences.append(original_sequence)
    624 					""" print xorg_compose_sequences[-1] """
    625 					
    626 			else:
    627 				print "Error in base char !?!"
    628 				exit(-2)
    629 		else:
    630 			print "OVER", sequence
    631 			exit(-1)
    632 	else:
    633 		sequence.append(codepoint)
    634 		xorg_compose_sequences.append(sequence)
    635 		""" print xorg_compose_sequences[-1] """
    636 
    637 def sequence_cmp(x, y):
    638 	if keysymvalue(x[0]) > keysymvalue(y[0]):
    639 		return 1
    640 	elif keysymvalue(x[0]) < keysymvalue(y[0]):
    641 		return -1
    642 	elif len(x) > len(y):
    643 		return 1
    644 	elif len(x) < len(y):
    645 		return -1
    646 	elif keysymvalue(x[1]) > keysymvalue(y[1]):
    647 		return 1
    648 	elif keysymvalue(x[1]) < keysymvalue(y[1]):
    649 		return -1
    650 	elif len(x) < 4:
    651 		return 0
    652 	elif keysymvalue(x[2]) > keysymvalue(y[2]):
    653 		return 1
    654 	elif keysymvalue(x[2]) < keysymvalue(y[2]):
    655 		return -1
    656 	elif len(x) < 5:
    657 		return 0
    658 	elif keysymvalue(x[3]) > keysymvalue(y[3]):
    659 		return 1
    660 	elif keysymvalue(x[3]) < keysymvalue(y[3]):
    661 		return -1
    662 	elif len(x) < 6:
    663 		return 0
    664 	elif keysymvalue(x[4]) > keysymvalue(y[4]):
    665 		return 1
    666 	elif keysymvalue(x[4]) < keysymvalue(y[4]):
    667 		return -1
    668 	else:
    669 		return 0
    670 
    671 def sequence_unicode_cmp(x, y):
    672 	if keysymunicodevalue(x[0]) > keysymunicodevalue(y[0]):
    673 		return 1
    674 	elif keysymunicodevalue(x[0]) < keysymunicodevalue(y[0]):
    675 		return -1
    676 	elif len(x) > len(y):
    677 		return 1
    678 	elif len(x) < len(y):
    679 		return -1
    680 	elif keysymunicodevalue(x[1]) > keysymunicodevalue(y[1]):
    681 		return 1
    682 	elif keysymunicodevalue(x[1]) < keysymunicodevalue(y[1]):
    683 		return -1
    684 	elif len(x) < 4:
    685 		return 0
    686 	elif keysymunicodevalue(x[2]) > keysymunicodevalue(y[2]):
    687 		return 1
    688 	elif keysymunicodevalue(x[2]) < keysymunicodevalue(y[2]):
    689 		return -1
    690 	elif len(x) < 5:
    691 		return 0
    692 	elif keysymunicodevalue(x[3]) > keysymunicodevalue(y[3]):
    693 		return 1
    694 	elif keysymunicodevalue(x[3]) < keysymunicodevalue(y[3]):
    695 		return -1
    696 	elif len(x) < 6:
    697 		return 0
    698 	elif keysymunicodevalue(x[4]) > keysymunicodevalue(y[4]):
    699 		return 1
    700 	elif keysymunicodevalue(x[4]) < keysymunicodevalue(y[4]):
    701 		return -1
    702 	else:
    703 		return 0
    704 
    705 def sequence_algorithmic_cmp(x, y):
    706 	if len(x) < len(y):
    707 		return -1
    708 	elif len(x) > len(y):
    709 		return 1
    710 	else:
    711 		for i in range(len(x)):
    712 			if x[i] < y[i]:
    713 				return -1
    714 			elif x[i] > y[i]:
    715 				return 1
    716 	return 0
    717 
    718 
    719 xorg_compose_sequences.sort(sequence_cmp)
    720 
    721 xorg_compose_sequences_uniqued = []
    722 first_time = True
    723 item = None
    724 for next_item in xorg_compose_sequences:
    725 	if first_time:
    726 		first_time = False
    727 		item = next_item
    728 	if sequence_unicode_cmp(item, next_item) != 0:
    729 		xorg_compose_sequences_uniqued.append(item)
    730 	item = next_item
    731 
    732 xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)
    733 
    734 counter_multikey = 0
    735 for item in xorg_compose_sequences:
    736 	if findall('Multi_key', "".join(item[:-1])) != []:
    737 		counter_multikey += 1
    738 
    739 xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
    740 xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)
    741 
    742 firstitem = ""
    743 num_first_keysyms = 0
    744 zeroes = 0
    745 num_entries = 0
    746 num_algorithmic_greek = 0
    747 for sequence in xorg_compose_sequences:
    748 	if keysymvalue(firstitem) != keysymvalue(sequence[0]): 
    749 		firstitem = sequence[0]
    750 		num_first_keysyms += 1
    751 	zeroes += 6 - len(sequence) + 1
    752 	num_entries += 1
    753 
    754 for sequence in xorg_compose_sequences_algorithmic_uniqued:
    755 	ch = ord(sequence[-1:][0])
    756 	if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
    757 		num_algorithmic_greek += 1
    758 		
    759 
    760 if opt_algorithmic:
    761 	for sequence in xorg_compose_sequences_algorithmic_uniqued:
    762 		letter = "".join(sequence[-1:])
    763 		print '0x%(cp)04X, %(uni)s, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter.encode('utf-8'), 'base': sequence[-2] },
    764 		for elem in sequence[:-2]:
    765 			print "<0x%(keysym)04X>," % { 'keysym': elem },
    766 		""" Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
    767 		print "], recomposed as", letter.encode('utf-8'), "verified"
    768 
    769 def num_of_keysyms(seq):
    770 	return len(seq) - 1
    771 
    772 def convert_UnotationToHex(arg):
    773 	if isinstance(arg, str):
    774 		if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg):
    775 			return sub('^U', '0x', arg)
    776 	return arg
    777 
    778 def addprefix_GDK(arg):
    779 	if match('^0x', arg):
    780 		return '%(arg)s, ' % { 'arg': arg }
    781 	else:
    782 		return 'GDK_KEY_%(arg)s, ' % { 'arg': arg }
    783 
    784 if opt_gtk:
    785 	first_keysym = ""
    786 	sequence = []
    787 	compose_table = []
    788 	ct_second_part = []
    789 	ct_sequence_width = 2
    790 	start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
    791 	we_finished = False
    792 	counter = 0
    793 
    794 	sequence_iterator = iter(xorg_compose_sequences)
    795 	sequence = sequence_iterator.next()
    796 	while True:
    797 		first_keysym = sequence[0]					# Set the first keysym
    798 		compose_table.append([first_keysym, 0, 0, 0, 0, 0])
    799 		while sequence[0] == first_keysym:
    800 			compose_table[counter][num_of_keysyms(sequence)-1] += 1
    801 			try:
    802 				sequence = sequence_iterator.next()
    803 			except StopIteration:
    804 				we_finished = True
    805 				break
    806 		if we_finished:
    807 			break
    808 		counter += 1
    809 
    810 	ct_index = start_offset
    811 	for line_num in range(len(compose_table)):
    812 		for i in range(WIDTHOFCOMPOSETABLE):
    813 			occurences = compose_table[line_num][i+1]
    814 			compose_table[line_num][i+1] = ct_index
    815 			ct_index += occurences * (i+2)
    816 
    817 	for sequence in xorg_compose_sequences:
    818 		ct_second_part.append(map(convert_UnotationToHex, sequence))
    819 
    820 	print headerfile_start
    821 	for i in compose_table:
    822 		if opt_gtkexpanded:
    823 			print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
    824 			print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
    825 		elif not match('^0x', i[0]):
    826 			print 'GDK_KEY_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
    827 		else:
    828 			print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
    829 	for i in ct_second_part:
    830 		if opt_numeric:
    831 			for ks in i[1:][:-1]:
    832 				print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
    833 			print '0x%(cp)04X, ' % { 'cp':i[-1] }
    834 			"""
    835 			for ks in i[:-1]:
    836 				print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
    837 			print '0x%(cp)04X, ' % { 'cp':i[-1] }
    838 			"""
    839 		elif opt_gtkexpanded:
    840 			print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
    841 		else:
    842 			print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
    843 	print headerfile_end 
    844 
    845 def redecompose(codepoint):
    846 	(name, decomposition, combiningclass) = unicodedatabase[codepoint]
    847 	if decomposition[0] == '' or decomposition[0] == '0':
    848 		return [codepoint]
    849 	if match('<\w+>', decomposition[0]):
    850 		numdecomposition = map(stringtohex, decomposition[1:])
    851 		return map(redecompose, numdecomposition)
    852 	numdecomposition = map(stringtohex, decomposition)
    853 	return map(redecompose, numdecomposition)
    854 
    855 def process_unicodedata_file(verbose = False):
    856 	""" Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
    857 	filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
    858 	try: 
    859 		unicodedatatxt = open(filename_unicodedatatxt, 'r')
    860 	except IOError, (errno, strerror):
    861 		print "I/O error(%s): %s" % (errno, strerror)
    862 		sys.exit(-1)
    863 	except:
    864 		print "Unexpected error: ", sys.exc_info()[0]
    865 		sys.exit(-1)
    866 	for line in unicodedatatxt.readlines():
    867 		if line[0] == "" or line[0] == '#':
    868 			continue
    869 		line = line[:-1]
    870 		uniproperties = split(';', line)
    871 		codepoint = stringtohex(uniproperties[0])
    872 		""" We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
    873 		if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF): 
    874 			continue
    875 		name = uniproperties[1]
    876 		category = uniproperties[2]
    877 		combiningclass = uniproperties[3]
    878 		decomposition = uniproperties[5]
    879 		unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
    880 	
    881 	counter_combinations = 0
    882 	counter_combinations_greek = 0
    883 	counter_entries = 0
    884 	counter_entries_greek = 0
    885 
    886 	for item in unicodedatabase.keys():
    887 		(name, decomposition, combiningclass) = unicodedatabase[item]
    888 		if decomposition[0] == '':
    889 			continue
    890 			print name, "is empty"
    891 		elif match('<\w+>', decomposition[0]):
    892 			continue
    893 			print name, "has weird", decomposition[0]
    894 		else:
    895 			sequence = map(stringtohex, decomposition)
    896 			chrsequence = map(unichr, sequence)
    897 			normalized = normalize('NFC', "".join(chrsequence))
    898 			
    899 			""" print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized),  """
    900 			decomposedsequence = []
    901 			for subseq in map(redecompose, sequence):
    902 				for seqitem in subseq:
    903 					if isinstance(seqitem, list):
    904 						for i in seqitem:
    905 							if isinstance(i, list):
    906 								for j in i:
    907 									decomposedsequence.append(j)
    908 							else:
    909 								decomposedsequence.append(i)
    910 					else:
    911 						decomposedsequence.append(seqitem)
    912 			recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
    913 			if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
    914 				counter_entries += 1
    915 				counter_combinations += factorial(len(decomposedsequence)-1)
    916 				ch = item
    917 				if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
    918 					counter_entries_greek += 1
    919 					counter_combinations_greek += factorial(len(decomposedsequence)-1)
    920 				if verbose:
    921 					print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
    922 					print "[",
    923 					for elem in decomposedsequence:
    924 						print '<0x%(hex)04X>,' % { 'hex': elem },
    925 					print "], recomposed as", recomposedchar,
    926 					if unichr(item) == recomposedchar:
    927 						print "verified"
    928 	
    929 	if verbose == False:
    930 		print "Unicode statistics from UnicodeData.txt"
    931 		print "Number of entries that can be algorithmically produced     :", counter_entries
    932 		print "  of which are for Greek                                   :", counter_entries_greek
    933 		print "Number of compose sequence combinations requiring          :", counter_combinations
    934 		print "  of which are for Greek                                   :", counter_combinations_greek
    935 		print "Note: We do not include partial compositions, "
    936 		print "thus the slight discrepancy in the figures"
    937 		print
    938 
    939 if opt_unicodedatatxt:
    940 	process_unicodedata_file(True)
    941 
    942 if opt_statistics:
    943 	print
    944 	print "Total number of compose sequences (from file)              :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
    945 	print "  of which can be expressed algorithmically                :", len(xorg_compose_sequences_algorithmic)
    946 	print "  of which cannot be expressed algorithmically             :", len(xorg_compose_sequences) 
    947 	print "    of which have Multi_key                                :", counter_multikey
    948 	print 
    949 	print "Algorithmic (stats for Xorg Compose file)"
    950 	print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
    951 	print "Number of sequences off due to algo (uniq(sort(array)))    :", len(xorg_compose_sequences_algorithmic_uniqued)
    952 	print "  of which are for Greek                                   :", num_algorithmic_greek
    953 	print 
    954 	process_unicodedata_file()
    955 	print "Not algorithmic (stats from Xorg Compose file)"
    956 	print "Number of sequences                                        :", len(xorg_compose_sequences) 
    957 	print "Flat array looks like                                      :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
    958 	print "Flat array would have taken up (in bytes)                  :", num_entries * 2 * 6, "bytes from the GTK+ library"
    959 	print "Number of items in flat array                              :", len(xorg_compose_sequences) * 6
    960 	print "  of which are zeroes                                      :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
    961 	print "Number of different first items                            :", num_first_keysyms
    962 	print "Number of max bytes (if using flat array)                  :", num_entries * 2 * 6
    963 	print "Number of savings                                          :", zeroes * 2 - num_first_keysyms * 2 * 5
    964 	print 
    965 	print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
    966 	print "                                                           :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
    967 	print
    968 	print "Existing (old) implementation in GTK+"
    969 	print "Number of sequences in old gtkimcontextsimple.c            :", 691
    970 	print "The existing (old) implementation in GTK+ takes up         :", 691 * 2 * 12, "bytes"
    971