Home | History | Annotate | Download | only in src
      1 #!/usr/bin/python
      2 
      3 import sys
      4 
      5 if len (sys.argv) != 5:
      6 	print >>sys.stderr, "usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
      7 	sys.exit (1)
      8 
      9 BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]
     10 
     11 files = [file (x) for x in sys.argv[1:]]
     12 
     13 headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
     14 headers.append (["UnicodeData.txt does not have a header."])
     15 
     16 data = [{} for f in files]
     17 values = [{} for f in files]
     18 for i, f in enumerate (files):
     19 	for line in f:
     20 
     21 		j = line.find ('#')
     22 		if j >= 0:
     23 			line = line[:j]
     24 
     25 		fields = [x.strip () for x in line.split (';')]
     26 		if len (fields) == 1:
     27 			continue
     28 
     29 		uu = fields[0].split ('..')
     30 		start = int (uu[0], 16)
     31 		if len (uu) == 1:
     32 			end = start
     33 		else:
     34 			end = int (uu[1], 16)
     35 
     36 		t = fields[1 if i != 2 else 2]
     37 
     38 		for u in range (start, end + 1):
     39 			data[i][u] = t
     40 		values[i][t] = values[i].get (t, 0) + end - start + 1
     41 
     42 defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')
     43 
     44 # TODO Characters that are not in Unicode Indic files, but used in USE
     45 data[0][0x034F] = defaults[0]
     46 data[0][0x2060] = defaults[0]
     47 for u in range (0xFE00, 0xFE0F + 1):
     48 	data[0][u] = defaults[0]
     49 
     50 # Merge data into one dict:
     51 for i,v in enumerate (defaults):
     52 	values[i][v] = values[i].get (v, 0) + 1
     53 combined = {}
     54 for i,d in enumerate (data):
     55 	for u,v in d.items ():
     56 		if i >= 2 and not u in combined:
     57 			continue
     58 		if not u in combined:
     59 			combined[u] = list (defaults)
     60 		combined[u][i] = v
     61 combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}
     62 data = combined
     63 del combined
     64 num = len (data)
     65 
     66 
# Every property-value name that the code below may refer to as a bare
# identifier.  Each name is wrapped in a PropertyValue and injected into the
# module globals further down, so entries here become usable as e.g. `Lo`,
# `Virama` or `Top`.
property_names = [
	# General_Category
	'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
	'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
	'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
	# Indic_Syllabic_Category
	'Other',
	'Bindu',
	'Visarga',
	'Avagraha',
	'Nukta',
	'Virama',
	'Pure_Killer',
	'Invisible_Stacker',
	'Vowel_Independent',
	'Vowel_Dependent',
	'Vowel',
	'Consonant_Placeholder',
	'Consonant',
	'Consonant_Dead',
	'Consonant_With_Stacker',
	'Consonant_Prefixed',
	'Consonant_Preceding_Repha',
	'Consonant_Succeeding_Repha',
	'Consonant_Subjoined',
	'Consonant_Medial',
	'Consonant_Final',
	'Consonant_Head_Letter',
	'Modifying_Letter',
	'Tone_Letter',
	'Tone_Mark',
	'Gemination_Mark',
	'Cantillation_Mark',
	'Register_Shifter',
	'Syllable_Modifier',
	'Consonant_Killer',
	'Non_Joiner',
	'Joiner',
	'Number_Joiner',
	'Number',
	'Brahmi_Joining_Number',
	# Indic_Positional_Category
	'Not_Applicable',
	'Right',
	'Left',
	'Visual_Order_Left',
	'Left_And_Right',
	'Top',
	'Bottom',
	'Top_And_Bottom',
	'Top_And_Right',
	'Top_And_Left',
	'Top_And_Left_And_Right',
	'Bottom_And_Right',
	'Top_And_Bottom_And_Right',
	'Overstruck',
]
    124 
    125 class PropertyValue(object):
    126 	def __init__(self, name_):
    127 		self.name = name_
    128 	def __str__(self):
    129 		return self.name
    130 	def __eq__(self, other):
    131 		return self.name == (other if isinstance(other, basestring) else other.name)
    132 	def __ne__(self, other):
    133 		return not (self == other)
    134 
    135 property_values = {}
    136 
    137 for name in property_names:
    138 	value = PropertyValue(name)
    139 	assert value not in property_values
    140 	assert value not in globals()
    141 	property_values[name] = value
    142 globals().update(property_values)
    143 
    144 
    145 def is_BASE(U, UISC, UGC):
    146 	return (UISC in [Number, Consonant, Consonant_Head_Letter,
    147 			#SPEC-OUTDATED Consonant_Placeholder,
    148 			Tone_Letter] or
    149 		(UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
    150 					Consonant_Subjoined, Vowel, Vowel_Dependent]))
    151 def is_BASE_VOWEL(U, UISC, UGC):
    152 	return UISC == Vowel_Independent
    153 def is_BASE_IND(U, UISC, UGC):
    154 	#SPEC-BROKEN return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
    155 	return (UISC in [Consonant_Dead, Modifying_Letter] or
    156 		(UGC == Po and not is_BASE_OTHER(U, UISC, UGC))) # for 104E
    157 def is_BASE_NUM(U, UISC, UGC):
    158 	return UISC == Brahmi_Joining_Number
    159 def is_BASE_OTHER(U, UISC, UGC):
    160 	if UISC == Consonant_Placeholder: return True #SPEC-OUTDATED
    161 	return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC,
    162 		     0x25FB, 0x25FC, 0x25FD, 0x25FE]
    163 def is_CGJ(U, UISC, UGC):
    164 	return U == 0x034F
    165 def is_CONS_FINAL(U, UISC, UGC):
    166 	return ((UISC == Consonant_Final and UGC != Lo) or
    167 		UISC == Consonant_Succeeding_Repha)
    168 def is_CONS_FINAL_MOD(U, UISC, UGC):
    169 	#SPEC-OUTDATED return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
    170 	return  UISC == Syllable_Modifier
    171 def is_CONS_MED(U, UISC, UGC):
    172 	return UISC == Consonant_Medial and UGC != Lo
    173 def is_CONS_MOD(U, UISC, UGC):
    174 	return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
    175 def is_CONS_SUB(U, UISC, UGC):
    176 	#SPEC-OUTDATED return UISC == Consonant_Subjoined
    177 	return UISC == Consonant_Subjoined and UGC != Lo
    178 def is_HALANT(U, UISC, UGC):
    179 	return UISC in [Virama, Invisible_Stacker]
    180 def is_HALANT_NUM(U, UISC, UGC):
    181 	return UISC == Number_Joiner
    182 def is_ZWNJ(U, UISC, UGC):
    183 	return UISC == Non_Joiner
    184 def is_ZWJ(U, UISC, UGC):
    185 	return UISC == Joiner
    186 def is_Word_Joiner(U, UISC, UGC):
    187 	return U == 0x2060
    188 def is_OTHER(U, UISC, UGC):
    189 	#SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
    190 	return (UISC == Other
    191 		and not is_SYM_MOD(U, UISC, UGC)
    192 		and not is_CGJ(U, UISC, UGC)
    193 		and not is_Word_Joiner(U, UISC, UGC)
    194 		and not is_VARIATION_SELECTOR(U, UISC, UGC)
    195 	)
    196 def is_Reserved(U, UISC, UGC):
    197 	return UGC == 'Cn'
    198 def is_REPHA(U, UISC, UGC):
    199 	#return UISC == Consonant_Preceding_Repha
    200 	#SPEC-OUTDATED hack to categorize Consonant_With_Stacker and Consonant_Prefixed
    201 	return UISC in [Consonant_Preceding_Repha, Consonant_With_Stacker, Consonant_Prefixed]
    202 def is_SYM(U, UISC, UGC):
    203 	if U == 0x25CC: return False #SPEC-OUTDATED
    204 	#SPEC-OUTDATED return UGC in [So, Sc] or UISC == Symbol_Letter
    205 	return UGC in [So, Sc]
    206 def is_SYM_MOD(U, UISC, UGC):
    207 	return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]
    208 def is_VARIATION_SELECTOR(U, UISC, UGC):
    209 	return 0xFE00 <= U <= 0xFE0F
    210 def is_VOWEL(U, UISC, UGC):
    211 	return (UISC == Pure_Killer or
    212 		(UGC != Lo and UISC in [Vowel, Vowel_Dependent]))
    213 def is_VOWEL_MOD(U, UISC, UGC):
    214 	return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
    215 		(UGC != Lo and UISC == Bindu))
    216 
# USE category tag -> predicate deciding whether a character belongs to it.
# Each predicate takes (U, UISC, UGC); map_to_use asserts that exactly one
# of them accepts any given character.
use_mapping = {
	'B':	is_BASE,
	'IV':	is_BASE_VOWEL,
	'IND':	is_BASE_IND,
	'N':	is_BASE_NUM,
	'GB':	is_BASE_OTHER,
	'CGJ':	is_CGJ,
	'F':	is_CONS_FINAL,
	'FM':	is_CONS_FINAL_MOD,
	'M':	is_CONS_MED,
	'CM':	is_CONS_MOD,
	'SUB':	is_CONS_SUB,
	'H':	is_HALANT,
	'HN':	is_HALANT_NUM,
	'ZWNJ':	is_ZWNJ,
	'ZWJ':	is_ZWJ,
	'WJ':	is_Word_Joiner,
	'O':	is_OTHER,
	'Rsv':	is_Reserved,
	'R':	is_REPHA,
	'S':	is_SYM,
	'SM':	is_SYM_MOD,
	'VS':	is_VARIATION_SELECTOR,
	'V':	is_VOWEL,
	'VM':	is_VOWEL_MOD,
}
    243 
# USE category tag -> {position suffix: [Indic_Positional_Category values]}.
# For a category listed with a dict, map_to_use appends the suffix whose list
# contains the character's positional category (e.g. 'V' + 'Abv').
# A category mapped to None may carry a positional category without
# tripping map_to_use's assertion, but gets no suffix.
use_positions = {
	'F': {
		'Abv': [Top],
		'Blw': [Bottom],
		'Pst': [Right],
	},
	'M': {
		'Abv': [Top],
		'Blw': [Bottom],
		'Pst': [Right],
		'Pre': [Left],
	},
	'CM': {
		'Abv': [Top],
		'Blw': [Bottom],
	},
	'V': {
		'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
		'Blw': [Bottom, Overstruck, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
	},
	'VM': {
		'Abv': [Top],
		'Blw': [Bottom, Overstruck],
		'Pst': [Right],
		'Pre': [Left],
	},
	'SM': {
		'Abv': [Top],
		'Blw': [Bottom],
	},
	'H': None,
	'B': None,
	'FM': None,
	'SUB': None,
}
    281 
    282 def map_to_use(data):
    283 	out = {}
    284 	items = use_mapping.items()
    285 	for U,(UISC,UIPC,UGC,UBlock) in data.items():
    286 
    287 		# Resolve Indic_Syllabic_Category
    288 
    289 		# TODO: These don't have UISC assigned in Unicode 8.0, but
    290 		# have UIPC
    291 		if U == 0x17DD: UISC = Vowel_Dependent
    292 		if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark
    293 
    294 		# TODO: U+1CED should only be allowed after some of
    295 		# the nasalization marks, maybe only for U+1CE9..U+1CF1.
    296 		if U == 0x1CED: UISC = Tone_Mark
    297 
    298 		evals = [(k, v(U,UISC,UGC)) for k,v in items]
    299 		values = [k for k,v in evals if v]
    300 		assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values)
    301 		USE = values[0]
    302 
    303 		# Resolve Indic_Positional_Category
    304 
    305 		# TODO: Not in Unicode 8.0 yet, but in spec.
    306 		if U == 0x1B6C: UIPC = Bottom
    307 
    308 		# TODO: These should die, but have UIPC in Unicode 8.0
    309 		if U in [0x953, 0x954]: UIPC = Not_Applicable
    310 
    311 		# TODO: In USE's override list but not in Unicode 8.0
    312 		if U == 0x103C: UIPC = Left
    313 
    314 		# TODO: These are not in USE's override list that we have, nor are they in Unicode 8.0
    315 		if 0xA926 <= U <= 0xA92A: UIPC = Top
    316 		if U == 0x111CA: UIPC = Bottom
    317 		if U == 0x11300: UIPC = Top
    318 		if U == 0x1133C: UIPC = Bottom
    319 		if U == 0x1171E: UIPC = Left # Correct?!
    320 		if 0x1CF2 <= U <= 0x1CF3: UIPC = Right
    321 		if 0x1CF8 <= U <= 0x1CF9: UIPC = Top
    322 
    323 		assert (UIPC in [Not_Applicable, Visual_Order_Left] or
    324 			USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)
    325 
    326 		pos_mapping = use_positions.get(USE, None)
    327 		if pos_mapping:
    328 			values = [k for k,v in pos_mapping.items() if v and UIPC in v]
    329 			assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values)
    330 			USE = USE + values[0]
    331 
    332 		out[U] = (USE, UBlock)
    333 	return out
    334 
    335 defaults = ('O', 'No_Block')
    336 data = map_to_use(data)
    337 
    338 # Remove the outliers
    339 singles = {}
    340 for u in [0x034F, 0x25CC, 0x1107F]:
    341 	singles[u] = data[u]
    342 	del data[u]
    343 
    344 print "/* == Start of generated table == */"
    345 print "/*"
    346 print " * The following table is generated by running:"
    347 print " *"
    348 print " *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
    349 print " *"
    350 print " * on files with these headers:"
    351 print " *"
    352 for h in headers:
    353 	for l in h:
    354 		print " * %s" % (l.strip())
    355 print " */"
    356 print
    357 print '#include "hb-ot-shape-complex-use-private.hh"'
    358 print
    359 
# Running statistics updated by print_block:
#   total      - number of table slots emitted so far
#   used       - how many of those slots hold a code point present in data
#   last_block - name of the most recent named block that was printed
total = 0
used = 0
last_block = None
def print_block (block, start, end, data):
	"""Print the table entries for code points start..end (inclusive).

	block is the block name, or None for filler between blocks; a heading
	comment is printed whenever a named block differs from the previous one.
	start must be a multiple of 8 and end + 1 a multiple of 8 (asserted).
	Code points missing from data are printed with the module-level
	`defaults` entry.  Updates the globals total, used and last_block.
	"""
	global total, used, last_block
	if block and block != last_block:
		print
		print
		print "  /* %s */" % block
		# Block starts mid-row: pad so its first entry lands in the right column.
		if start % 16:
			print ' ' * (20 + (start % 16 * 6)),
	num = 0
	assert start % 8 == 0
	assert (end+1) % 8 == 0
	for u in range (start, end+1):
		# Every 16 code points begin a new row labelled with its first code point.
		if u % 16 == 0:
			print
			print "  /* %04X */" % u,
		if u in data:
			num += 1
		d = data.get (u, defaults)
		sys.stdout.write ("%6s," % d[0])

	total += end - start + 1
	used += num
	if block:
		last_block = block
    387 
    388 uu = data.keys ()
    389 uu.sort ()
    390 
    391 last = -100000
    392 num = 0
    393 offset = 0
    394 starts = []
    395 ends = []
    396 for k,v in sorted(use_mapping.items()):
    397 	if k in use_positions and use_positions[k]: continue
    398 	print "#define %s	USE_%s	/* %s */" % (k, k, v.__name__[3:])
    399 for k,v in sorted(use_positions.items()):
    400 	if not v: continue
    401 	for suf in v.keys():
    402 		tag = k + suf
    403 		print "#define %s	USE_%s" % (tag, tag)
    404 print ""
    405 print "static const USE_TABLE_ELEMENT_TYPE use_table[] = {"
    406 for u in uu:
    407 	if u <= last:
    408 		continue
    409 	block = data[u][1]
    410 
    411 	start = u//8*8
    412 	end = start+1
    413 	while end in uu and block == data[end][1]:
    414 		end += 1
    415 	end = (end-1)//8*8 + 7
    416 
    417 	if start != last + 1:
    418 		if start - last <= 1+16*3:
    419 			print_block (None, last+1, start-1, data)
    420 			last = start-1
    421 		else:
    422 			if last >= 0:
    423 				ends.append (last + 1)
    424 				offset += ends[-1] - starts[-1]
    425 			print
    426 			print
    427 			print "#define use_offset_0x%04xu %d" % (start, offset)
    428 			starts.append (start)
    429 
    430 	print_block (block, start, end, data)
    431 	last = end
    432 ends.append (last + 1)
    433 offset += ends[-1] - starts[-1]
print
print
# Percentage of emitted table slots that hold a code point present in data.
occupancy = used * 100. / total
# The generated switch dispatches on the code point shifted right by this many bits.
page_bits = 12
print "}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy)
print
print "USE_TABLE_ELEMENT_TYPE"
print "hb_use_get_categories (hb_codepoint_t u)"
print "{"
print "  switch (u >> %d)" % page_bits
print "  {"
# One case per page that contains a segment boundary or a single outlier.
pages = set([u>>page_bits for u in starts+ends+singles.keys()])
for p in sorted(pages):
	print "    case 0x%0Xu:" % p
	for (start,end) in zip (starts, ends):
		if p not in [start>>page_bits, end>>page_bits]: continue
		# Note: `offset` is rebound here to the name of the offset macro.
		offset = "use_offset_0x%04xu" % start
		print "      if (hb_in_range (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset)
	for u,d in singles.items ():
		if p != u>>page_bits: continue
		print "      if (unlikely (u == 0x%04Xu)) return %s;" % (u, d[0])
	print "      break;"
	print ""
print "    default:"
print "      break;"
print "  }"
print "  return USE_O;"
print "}"
print
# Undefine every alias that was #defined before the table.
for k in sorted(use_mapping.keys()):
	if k in use_positions and use_positions[k]: continue
	print "#undef %s" % k
for k,v in sorted(use_positions.items()):
	if not v: continue
	for suf in v.keys():
		tag = k + suf
		print "#undef %s" % tag
print
print "/* == End of generated table == */"

# Maintain at least 50% occupancy in the table
if occupancy < 50:
	raise Exception ("Table too sparse, please investigate: ", occupancy)
    477