Home | History | Annotate | Download | only in src
      1 #!/usr/bin/python
      2 
      3 import sys
      4 
      5 if len (sys.argv) != 5:
      6 	print >>sys.stderr, "usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
      7 	sys.exit (1)
      8 
      9 BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]
     10 
     11 files = [file (x) for x in sys.argv[1:]]
     12 
     13 headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
     14 headers.append (["UnicodeData.txt does not have a header."])
     15 
     16 data = [{} for f in files]
     17 values = [{} for f in files]
     18 for i, f in enumerate (files):
     19 	for line in f:
     20 
     21 		j = line.find ('#')
     22 		if j >= 0:
     23 			line = line[:j]
     24 
     25 		fields = [x.strip () for x in line.split (';')]
     26 		if len (fields) == 1:
     27 			continue
     28 
     29 		uu = fields[0].split ('..')
     30 		start = int (uu[0], 16)
     31 		if len (uu) == 1:
     32 			end = start
     33 		else:
     34 			end = int (uu[1], 16)
     35 
     36 		t = fields[1 if i != 2 else 2]
     37 
     38 		for u in range (start, end + 1):
     39 			data[i][u] = t
     40 		values[i][t] = values[i].get (t, 0) + end - start + 1
     41 
     42 defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')
     43 
     44 # TODO Characters that are not in Unicode Indic files, but used in USE
     45 data[0][0x034F] = defaults[0]
     46 data[0][0x2060] = defaults[0]
     47 data[0][0x20F0] = defaults[0]
     48 for u in range (0xFE00, 0xFE0F + 1):
     49 	data[0][u] = defaults[0]
     50 
     51 # Merge data into one dict:
     52 for i,v in enumerate (defaults):
     53 	values[i][v] = values[i].get (v, 0) + 1
     54 combined = {}
     55 for i,d in enumerate (data):
     56 	for u,v in d.items ():
     57 		if i >= 2 and not u in combined:
     58 			continue
     59 		if not u in combined:
     60 			combined[u] = list (defaults)
     61 		combined[u][i] = v
     62 combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}
     63 data = combined
     64 del combined
     65 num = len (data)
     66 
     67 
     68 property_names = [
     69 	# General_Category
     70 	'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
     71 	'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
     72 	'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
     73 	# Indic_Syllabic_Category
     74 	'Other',
     75 	'Bindu',
     76 	'Visarga',
     77 	'Avagraha',
     78 	'Nukta',
     79 	'Virama',
     80 	'Pure_Killer',
     81 	'Invisible_Stacker',
     82 	'Vowel_Independent',
     83 	'Vowel_Dependent',
     84 	'Vowel',
     85 	'Consonant_Placeholder',
     86 	'Consonant',
     87 	'Consonant_Dead',
     88 	'Consonant_With_Stacker',
     89 	'Consonant_Prefixed',
     90 	'Consonant_Preceding_Repha',
     91 	'Consonant_Succeeding_Repha',
     92 	'Consonant_Subjoined',
     93 	'Consonant_Medial',
     94 	'Consonant_Final',
     95 	'Consonant_Head_Letter',
     96 	'Modifying_Letter',
     97 	'Tone_Letter',
     98 	'Tone_Mark',
     99 	'Gemination_Mark',
    100 	'Cantillation_Mark',
    101 	'Register_Shifter',
    102 	'Syllable_Modifier',
    103 	'Consonant_Killer',
    104 	'Non_Joiner',
    105 	'Joiner',
    106 	'Number_Joiner',
    107 	'Number',
    108 	'Brahmi_Joining_Number',
    109 	# Indic_Positional_Category
    110 	'Not_Applicable',
    111 	'Right',
    112 	'Left',
    113 	'Visual_Order_Left',
    114 	'Left_And_Right',
    115 	'Top',
    116 	'Bottom',
    117 	'Top_And_Bottom',
    118 	'Top_And_Right',
    119 	'Top_And_Left',
    120 	'Top_And_Left_And_Right',
    121 	'Bottom_And_Left',
    122 	'Bottom_And_Right',
    123 	'Top_And_Bottom_And_Right',
    124 	'Overstruck',
    125 ]
    126 
    127 class PropertyValue(object):
    128 	def __init__(self, name_):
    129 		self.name = name_
    130 	def __str__(self):
    131 		return self.name
    132 	def __eq__(self, other):
    133 		return self.name == (other if isinstance(other, basestring) else other.name)
    134 	def __ne__(self, other):
    135 		return not (self == other)
    136 
    137 property_values = {}
    138 
    139 for name in property_names:
    140 	value = PropertyValue(name)
    141 	assert value not in property_values
    142 	assert value not in globals()
    143 	property_values[name] = value
    144 globals().update(property_values)
    145 
    146 
    147 def is_BASE(U, UISC, UGC):
    148 	return (UISC in [Number, Consonant, Consonant_Head_Letter,
    149 			#SPEC-DRAFT Consonant_Placeholder,
    150 			Tone_Letter,
    151 			Vowel_Independent #SPEC-DRAFT
    152 			] or
    153 		(UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
    154 					Consonant_Subjoined, Vowel, Vowel_Dependent]))
    155 def is_BASE_IND(U, UISC, UGC):
    156 	#SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
    157 	return (UISC in [Consonant_Dead, Modifying_Letter] or
    158 		(UGC == Po and not U in [0x104E, 0x2022, 0x11A3F, 0x11A45]) or
    159 		False # SPEC-DRAFT-OUTDATED! U == 0x002D
    160 		)
    161 def is_BASE_NUM(U, UISC, UGC):
    162 	return UISC == Brahmi_Joining_Number
    163 def is_BASE_OTHER(U, UISC, UGC):
    164 	if UISC == Consonant_Placeholder: return True #SPEC-DRAFT
    165 	#SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
    166 	return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
    167 def is_CGJ(U, UISC, UGC):
    168 	return U == 0x034F
    169 def is_CONS_FINAL(U, UISC, UGC):
    170 	return ((UISC == Consonant_Final and UGC != Lo) or
    171 		UISC == Consonant_Succeeding_Repha)
    172 def is_CONS_FINAL_MOD(U, UISC, UGC):
    173 	#SPEC-DRAFT return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
    174 	return  UISC == Syllable_Modifier
    175 def is_CONS_MED(U, UISC, UGC):
    176 	return UISC == Consonant_Medial and UGC != Lo
    177 def is_CONS_MOD(U, UISC, UGC):
    178 	return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
    179 def is_CONS_SUB(U, UISC, UGC):
    180 	#SPEC-DRAFT return UISC == Consonant_Subjoined
    181 	return UISC == Consonant_Subjoined and UGC != Lo
    182 def is_CONS_WITH_STACKER(U, UISC, UGC):
    183 	return UISC == Consonant_With_Stacker
    184 def is_HALANT(U, UISC, UGC):
    185 	return UISC in [Virama, Invisible_Stacker]
    186 def is_HALANT_NUM(U, UISC, UGC):
    187 	return UISC == Number_Joiner
    188 def is_ZWNJ(U, UISC, UGC):
    189 	return UISC == Non_Joiner
    190 def is_ZWJ(U, UISC, UGC):
    191 	return UISC == Joiner
    192 def is_Word_Joiner(U, UISC, UGC):
    193 	return U == 0x2060
    194 def is_OTHER(U, UISC, UGC):
    195 	#SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
    196 	return (UISC == Other
    197 		and not is_SYM_MOD(U, UISC, UGC)
    198 		and not is_CGJ(U, UISC, UGC)
    199 		and not is_Word_Joiner(U, UISC, UGC)
    200 		and not is_VARIATION_SELECTOR(U, UISC, UGC)
    201 	)
    202 def is_Reserved(U, UISC, UGC):
    203 	return UGC == 'Cn'
    204 def is_REPHA(U, UISC, UGC):
    205 	return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
    206 def is_SYM(U, UISC, UGC):
    207 	if U == 0x25CC: return False #SPEC-DRAFT
    208 	#SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
    209 	return UGC in [So, Sc]
    210 def is_SYM_MOD(U, UISC, UGC):
    211 	return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]
    212 def is_VARIATION_SELECTOR(U, UISC, UGC):
    213 	return 0xFE00 <= U <= 0xFE0F
    214 def is_VOWEL(U, UISC, UGC):
    215 	# https://github.com/roozbehp/unicode-data/issues/6
    216 	return (UISC == Pure_Killer or
    217 		(UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))
    218 def is_VOWEL_MOD(U, UISC, UGC):
    219 	# https://github.com/roozbehp/unicode-data/issues/6
    220 	return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
    221 		(UGC != Lo and (UISC == Bindu or U in [0xAA29])))
    222 
# Maps each USE tag to the predicate deciding whether a code point belongs
# to it.  Every predicate is called as f(U, UISC, UGC); map_to_use asserts
# that exactly one of them accepts any given code point.  The tag also
# names the emitted macro (USE_<tag>).
use_mapping = {
	'B':	is_BASE,
	'IND':	is_BASE_IND,
	'N':	is_BASE_NUM,
	'GB':	is_BASE_OTHER,
	'CGJ':	is_CGJ,
	'F':	is_CONS_FINAL,
	'FM':	is_CONS_FINAL_MOD,
	'M':	is_CONS_MED,
	'CM':	is_CONS_MOD,
	'SUB':	is_CONS_SUB,
	'CS':	is_CONS_WITH_STACKER,
	'H':	is_HALANT,
	'HN':	is_HALANT_NUM,
	'ZWNJ':	is_ZWNJ,
	'ZWJ':	is_ZWJ,
	'WJ':	is_Word_Joiner,
	'O':	is_OTHER,
	'Rsv':	is_Reserved,
	'R':	is_REPHA,
	'S':	is_SYM,
	'SM':	is_SYM_MOD,
	'VS':	is_VARIATION_SELECTOR,
	'V':	is_VOWEL,
	'VM':	is_VOWEL_MOD,
}
    249 
# Positional refinement of USE tags.
#
# For a tag mapped to a dict, map_to_use appends the suffix ('Abv', 'Blw',
# 'Pst', 'Pre') whose list contains the code point's
# Indic_Positional_Category, and asserts that exactly one suffix matches.
# A tag mapped to None may carry a positional category but gets no suffix.
# Tags absent from this table must have a positional category of
# Not_Applicable or Visual_Order_Left (asserted in map_to_use).
#
# NOTE: the inner dicts are iterated unsorted when the #define / #undef
# lines are emitted, so their iteration order shows up in the output.
use_positions = {
	'F': {
		'Abv': [Top],
		'Blw': [Bottom],
		'Pst': [Right],
	},
	'M': {
		'Abv': [Top],
		'Blw': [Bottom, Bottom_And_Left],
		'Pst': [Right],
		'Pre': [Left],
	},
	'CM': {
		'Abv': [Top],
		'Blw': [Bottom],
	},
	'V': {
		'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
		'Blw': [Bottom, Overstruck, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
	},
	'VM': {
		'Abv': [Top],
		'Blw': [Bottom, Overstruck],
		'Pst': [Right],
		'Pre': [Left],
	},
	'SM': {
		'Abv': [Top],
		'Blw': [Bottom],
	},
	'H': None,
	'B': None,
	'FM': None,
	'SUB': None,
}
    287 
    288 def map_to_use(data):
    289 	out = {}
    290 	items = use_mapping.items()
    291 	for U,(UISC,UIPC,UGC,UBlock) in data.items():
    292 
    293 		# Resolve Indic_Syllabic_Category
    294 
    295 		# TODO: These don't have UISC assigned in Unicode 8.0, but
    296 		# have UIPC
    297 		if U == 0x17DD: UISC = Vowel_Dependent
    298 		if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark
    299 
    300 		# TODO: U+1CED should only be allowed after some of
    301 		# the nasalization marks, maybe only for U+1CE9..U+1CF1.
    302 		if U == 0x1CED: UISC = Tone_Mark
    303 
    304 		# TODO: https://github.com/harfbuzz/harfbuzz/issues/525
    305 		if U == 0x1A7F: UISC = Consonant_Final; UIPC = Bottom
    306 
    307 		# TODO: https://github.com/harfbuzz/harfbuzz/pull/609
    308 		if U == 0x20F0: UISC = Cantillation_Mark; UIPC = Top
    309 
    310 		# TODO: https://github.com/harfbuzz/harfbuzz/pull/626
    311 		if U == 0xA8B4: UISC = Consonant_Medial
    312 
    313 		values = [k for k,v in items if v(U,UISC,UGC)]
    314 		assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values)
    315 		USE = values[0]
    316 
    317 		# Resolve Indic_Positional_Category
    318 
    319 		# TODO: Not in Unicode 8.0 yet, but in spec.
    320 		if U == 0x1B6C: UIPC = Bottom
    321 
    322 		# TODO: These should die, but have UIPC in Unicode 8.0
    323 		if U in [0x953, 0x954]: UIPC = Not_Applicable
    324 
    325 		# TODO: In USE's override list but not in Unicode 8.0
    326 		if U == 0x103C: UIPC = Left
    327 
    328 		# TODO: These are not in USE's override list that we have, nor are they in Unicode 8.0
    329 		if 0xA926 <= U <= 0xA92A: UIPC = Top
    330 		if U == 0x111CA: UIPC = Bottom
    331 		if U == 0x11300: UIPC = Top
    332 		if U == 0x1133C: UIPC = Bottom
    333 		if U == 0x1171E: UIPC = Left # Correct?!
    334 		if 0x1CF2 <= U <= 0x1CF3: UIPC = Right
    335 		if 0x1CF8 <= U <= 0x1CF9: UIPC = Top
    336 
    337 		assert (UIPC in [Not_Applicable, Visual_Order_Left] or
    338 			USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)
    339 
    340 		pos_mapping = use_positions.get(USE, None)
    341 		if pos_mapping:
    342 			values = [k for k,v in pos_mapping.items() if v and UIPC in v]
    343 			assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values)
    344 			USE = USE + values[0]
    345 
    346 		out[U] = (USE, UBlock)
    347 	return out
    348 
    349 defaults = ('O', 'No_Block')
    350 data = map_to_use(data)
    351 
    352 # Remove the outliers
    353 singles = {}
    354 for u in [0x034F, 0x25CC, 0x1107F]:
    355 	singles[u] = data[u]
    356 	del data[u]
    357 
    358 print "/* == Start of generated table == */"
    359 print "/*"
    360 print " * The following table is generated by running:"
    361 print " *"
    362 print " *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
    363 print " *"
    364 print " * on files with these headers:"
    365 print " *"
    366 for h in headers:
    367 	for l in h:
    368 		print " * %s" % (l.strip())
    369 print " */"
    370 print
    371 print '#include "hb-ot-shape-complex-use-private.hh"'
    372 print
    373 
# Running statistics maintained by print_block:
#   total      - number of table slots emitted so far
#   used       - how many of those slots had an entry in the data
#   last_block - name of the most recent named block emitted
total = 0
used = 0
last_block = None
def print_block (block, start, end, data):
	"""Write the table entries for code points start..end (inclusive).

	block -- block name for the heading comment, or None for filler
	         rows (no heading, last_block left untouched).
	start -- first code point; must be a multiple of 8.
	end   -- last code point; end + 1 must be a multiple of 8.
	data  -- dict mapping code point -> (USE tag, block name); code
	         points missing from it are written using the module-level
	         `defaults` entry.

	Updates the module-level counters total, used and last_block.
	"""
	global total, used, last_block
	if block and block != last_block:
		print
		print
		print "  /* %s */" % block
		if start % 16:
			# The block starts mid-row: pad so its first entry lines
			# up with the column it would occupy in a full row.  The
			# trailing comma suppresses the newline.
			print ' ' * (20 + (start % 16 * 6)),
	num = 0
	assert start % 8 == 0
	assert (end+1) % 8 == 0
	for u in range (start, end+1):
		if u % 16 == 0:
			# New row of 16 entries, labelled with its first code point.
			print
			print "  /* %04X */" % u,
		if u in data:
			num += 1
		d = data.get (u, defaults)
		# Written directly to stdout so that no separator is inserted
		# between entries.
		sys.stdout.write ("%6s," % d[0])

	total += end - start + 1
	used += num
	if block:
		last_block = block
    401 
    402 uu = data.keys ()
    403 uu.sort ()
    404 
    405 last = -100000
    406 num = 0
    407 offset = 0
    408 starts = []
    409 ends = []
    410 for k,v in sorted(use_mapping.items()):
    411 	if k in use_positions and use_positions[k]: continue
    412 	print "#define %s	USE_%s	/* %s */" % (k, k, v.__name__[3:])
    413 for k,v in sorted(use_positions.items()):
    414 	if not v: continue
    415 	for suf in v.keys():
    416 		tag = k + suf
    417 		print "#define %s	USE_%s" % (tag, tag)
    418 print ""
    419 print "static const USE_TABLE_ELEMENT_TYPE use_table[] = {"
    420 for u in uu:
    421 	if u <= last:
    422 		continue
    423 	block = data[u][1]
    424 
    425 	start = u//8*8
    426 	end = start+1
    427 	while end in uu and block == data[end][1]:
    428 		end += 1
    429 	end = (end-1)//8*8 + 7
    430 
    431 	if start != last + 1:
    432 		if start - last <= 1+16*3:
    433 			print_block (None, last+1, start-1, data)
    434 			last = start-1
    435 		else:
    436 			if last >= 0:
    437 				ends.append (last + 1)
    438 				offset += ends[-1] - starts[-1]
    439 			print
    440 			print
    441 			print "#define use_offset_0x%04xu %d" % (start, offset)
    442 			starts.append (start)
    443 
    444 	print_block (block, start, end, data)
    445 	last = end
    446 ends.append (last + 1)
    447 offset += ends[-1] - starts[-1]
    448 print
    449 print
    450 occupancy = used * 100. / total
    451 page_bits = 12
    452 print "}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy)
    453 print
    454 print "USE_TABLE_ELEMENT_TYPE"
    455 print "hb_use_get_categories (hb_codepoint_t u)"
    456 print "{"
    457 print "  switch (u >> %d)" % page_bits
    458 print "  {"
    459 pages = set([u>>page_bits for u in starts+ends+singles.keys()])
    460 for p in sorted(pages):
    461 	print "    case 0x%0Xu:" % p
    462 	for (start,end) in zip (starts, ends):
    463 		if p not in [start>>page_bits, end>>page_bits]: continue
    464 		offset = "use_offset_0x%04xu" % start
    465 		print "      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset)
    466 	for u,d in singles.items ():
    467 		if p != u>>page_bits: continue
    468 		print "      if (unlikely (u == 0x%04Xu)) return %s;" % (u, d[0])
    469 	print "      break;"
    470 	print ""
    471 print "    default:"
    472 print "      break;"
    473 print "  }"
    474 print "  return USE_O;"
    475 print "}"
    476 print
    477 for k in sorted(use_mapping.keys()):
    478 	if k in use_positions and use_positions[k]: continue
    479 	print "#undef %s" % k
    480 for k,v in sorted(use_positions.items()):
    481 	if not v: continue
    482 	for suf in v.keys():
    483 		tag = k + suf
    484 		print "#undef %s" % tag
    485 print
    486 print "/* == End of generated table == */"
    487 
    488 # Maintain at least 50% occupancy in the table */
    489 if occupancy < 50:
    490 	raise Exception ("Table too sparse, please investigate: ", occupancy)
    491