Home | History | Annotate | Download | only in src
      1 #!/usr/bin/env python
      2 
      3 from __future__ import print_function, division, absolute_import
      4 
      5 import io, sys
      6 
      7 if len (sys.argv) != 5:
      8 	print ("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt", file=sys.stderr)
      9 	sys.exit (1)
     10 
     11 BLACKLISTED_BLOCKS = ["Thai", "Lao"]
     12 
     13 files = [io.open (x, encoding='utf-8') for x in sys.argv[1:]]
     14 
     15 headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
     16 headers.append (["UnicodeData.txt does not have a header."])
     17 
     18 data = [{} for f in files]
     19 values = [{} for f in files]
     20 for i, f in enumerate (files):
     21 	for line in f:
     22 
     23 		j = line.find ('#')
     24 		if j >= 0:
     25 			line = line[:j]
     26 
     27 		fields = [x.strip () for x in line.split (';')]
     28 		if len (fields) == 1:
     29 			continue
     30 
     31 		uu = fields[0].split ('..')
     32 		start = int (uu[0], 16)
     33 		if len (uu) == 1:
     34 			end = start
     35 		else:
     36 			end = int (uu[1], 16)
     37 
     38 		t = fields[1 if i != 2 else 2]
     39 
     40 		for u in range (start, end + 1):
     41 			data[i][u] = t
     42 		values[i][t] = values[i].get (t, 0) + end - start + 1
     43 
     44 defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')
     45 
     46 # TODO Characters that are not in Unicode Indic files, but used in USE
     47 data[0][0x034F] = defaults[0]
     48 data[0][0x2060] = defaults[0]
     49 data[0][0x20F0] = defaults[0]
     50 # TODO https://github.com/roozbehp/unicode-data/issues/9
     51 data[0][0x11C44] = 'Consonant_Placeholder'
     52 data[0][0x11C45] = 'Consonant_Placeholder'
     53 # TODO https://github.com/harfbuzz/harfbuzz/pull/1399
     54 data[0][0x111C8] = 'Consonant_Placeholder'
     55 for u in range (0xFE00, 0xFE0F + 1):
     56 	data[0][u] = defaults[0]
     57 
     58 # Merge data into one dict:
     59 for i,v in enumerate (defaults):
     60 	values[i][v] = values[i].get (v, 0) + 1
     61 combined = {}
     62 for i,d in enumerate (data):
     63 	for u,v in d.items ():
     64 		if i >= 2 and not u in combined:
     65 			continue
     66 		if not u in combined:
     67 			combined[u] = list (defaults)
     68 		combined[u][i] = v
     69 combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}
     70 data = combined
     71 del combined
     72 num = len (data)
     73 
     74 
     75 property_names = [
     76 	# General_Category
     77 	'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
     78 	'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
     79 	'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
     80 	# Indic_Syllabic_Category
     81 	'Other',
     82 	'Bindu',
     83 	'Visarga',
     84 	'Avagraha',
     85 	'Nukta',
     86 	'Virama',
     87 	'Pure_Killer',
     88 	'Invisible_Stacker',
     89 	'Vowel_Independent',
     90 	'Vowel_Dependent',
     91 	'Vowel',
     92 	'Consonant_Placeholder',
     93 	'Consonant',
     94 	'Consonant_Dead',
     95 	'Consonant_With_Stacker',
     96 	'Consonant_Prefixed',
     97 	'Consonant_Preceding_Repha',
     98 	'Consonant_Succeeding_Repha',
     99 	'Consonant_Subjoined',
    100 	'Consonant_Medial',
    101 	'Consonant_Final',
    102 	'Consonant_Head_Letter',
    103 	'Consonant_Initial_Postfixed',
    104 	'Modifying_Letter',
    105 	'Tone_Letter',
    106 	'Tone_Mark',
    107 	'Gemination_Mark',
    108 	'Cantillation_Mark',
    109 	'Register_Shifter',
    110 	'Syllable_Modifier',
    111 	'Consonant_Killer',
    112 	'Non_Joiner',
    113 	'Joiner',
    114 	'Number_Joiner',
    115 	'Number',
    116 	'Brahmi_Joining_Number',
    117 	# Indic_Positional_Category
    118 	'Not_Applicable',
    119 	'Right',
    120 	'Left',
    121 	'Visual_Order_Left',
    122 	'Left_And_Right',
    123 	'Top',
    124 	'Bottom',
    125 	'Top_And_Bottom',
    126 	'Top_And_Right',
    127 	'Top_And_Left',
    128 	'Top_And_Left_And_Right',
    129 	'Bottom_And_Left',
    130 	'Bottom_And_Right',
    131 	'Top_And_Bottom_And_Right',
    132 	'Overstruck',
    133 ]
    134 
    135 try:
    136 	basestring
    137 except NameError:
    138 	basestring = str
    139 
    140 class PropertyValue(object):
    141 	def __init__(self, name_):
    142 		self.name = name_
    143 	def __str__(self):
    144 		return self.name
    145 	def __eq__(self, other):
    146 		return self.name == (other if isinstance(other, basestring) else other.name)
    147 	def __ne__(self, other):
    148 		return not (self == other)
    149 	def __hash__(self):
    150 		return hash(str(self))
    151 
    152 property_values = {}
    153 
    154 for name in property_names:
    155 	value = PropertyValue(name)
    156 	assert value not in property_values
    157 	assert value not in globals()
    158 	property_values[name] = value
    159 globals().update(property_values)
    160 
    161 
    162 def is_BASE(U, UISC, UGC):
    163 	return (UISC in [Number, Consonant, Consonant_Head_Letter,
    164 			#SPEC-DRAFT Consonant_Placeholder,
    165 			Tone_Letter,
    166 			Vowel_Independent #SPEC-DRAFT
    167 			] or
    168 		(UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
    169 					Consonant_Subjoined, Vowel, Vowel_Dependent]))
    170 def is_BASE_IND(U, UISC, UGC):
    171 	#SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
    172 	return (UISC in [Consonant_Dead, Modifying_Letter] or
    173 		(UGC == Po and not U in [0x104B, 0x104E, 0x2022, 0x111C8, 0x11A3F, 0x11A45, 0x11C44, 0x11C45]) or
    174 		False # SPEC-DRAFT-OUTDATED! U == 0x002D
    175 		)
    176 def is_BASE_NUM(U, UISC, UGC):
    177 	return UISC == Brahmi_Joining_Number
    178 def is_BASE_OTHER(U, UISC, UGC):
    179 	if UISC == Consonant_Placeholder: return True #SPEC-DRAFT
    180 	#SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
    181 	return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
    182 def is_CGJ(U, UISC, UGC):
    183 	return U == 0x034F
    184 def is_CONS_FINAL(U, UISC, UGC):
    185 	# Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
    186 	return ((UISC == Consonant_Final and UGC != Lo) or
    187 		UISC == Consonant_Initial_Postfixed or
    188 		UISC == Consonant_Succeeding_Repha)
    189 def is_CONS_FINAL_MOD(U, UISC, UGC):
    190 	#SPEC-DRAFT return  UISC in [Consonant_Final_Modifier, Syllable_Modifier]
    191 	return  UISC == Syllable_Modifier
    192 def is_CONS_MED(U, UISC, UGC):
    193 	return UISC == Consonant_Medial and UGC != Lo
    194 def is_CONS_MOD(U, UISC, UGC):
    195 	return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
    196 def is_CONS_SUB(U, UISC, UGC):
    197 	#SPEC-DRAFT return UISC == Consonant_Subjoined
    198 	return UISC == Consonant_Subjoined and UGC != Lo
    199 def is_CONS_WITH_STACKER(U, UISC, UGC):
    200 	return UISC == Consonant_With_Stacker
    201 def is_HALANT(U, UISC, UGC):
    202 	return UISC in [Virama, Invisible_Stacker] and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC)
    203 def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC):
    204 	# https://github.com/harfbuzz/harfbuzz/issues/1102
    205 	# https://github.com/harfbuzz/harfbuzz/issues/1379
    206 	return U in [0x11046, 0x1134D]
    207 def is_HALANT_NUM(U, UISC, UGC):
    208 	return UISC == Number_Joiner
    209 def is_ZWNJ(U, UISC, UGC):
    210 	return UISC == Non_Joiner
    211 def is_ZWJ(U, UISC, UGC):
    212 	return UISC == Joiner
    213 def is_Word_Joiner(U, UISC, UGC):
    214 	return U == 0x2060
    215 def is_OTHER(U, UISC, UGC):
    216 	#SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
    217 	return (UISC == Other
    218 		and not is_SYM_MOD(U, UISC, UGC)
    219 		and not is_CGJ(U, UISC, UGC)
    220 		and not is_Word_Joiner(U, UISC, UGC)
    221 		and not is_VARIATION_SELECTOR(U, UISC, UGC)
    222 	)
    223 def is_Reserved(U, UISC, UGC):
    224 	return UGC == 'Cn'
    225 def is_REPHA(U, UISC, UGC):
    226 	return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
    227 def is_SYM(U, UISC, UGC):
    228 	if U == 0x25CC: return False #SPEC-DRAFT
    229 	#SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
    230 	return UGC in [So, Sc]
    231 def is_SYM_MOD(U, UISC, UGC):
    232 	return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]
    233 def is_VARIATION_SELECTOR(U, UISC, UGC):
    234 	return 0xFE00 <= U <= 0xFE0F
    235 def is_VOWEL(U, UISC, UGC):
    236 	# https://github.com/roozbehp/unicode-data/issues/6
    237 	return (UISC == Pure_Killer or
    238 		(UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))
    239 def is_VOWEL_MOD(U, UISC, UGC):
    240 	# https://github.com/roozbehp/unicode-data/issues/6
    241 	return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
    242 		(UGC != Lo and (UISC == Bindu or U in [0xAA29])))
    243 
    244 use_mapping = {
    245 	'B':	is_BASE,
    246 	'IND':	is_BASE_IND,
    247 	'N':	is_BASE_NUM,
    248 	'GB':	is_BASE_OTHER,
    249 	'CGJ':	is_CGJ,
    250 	'F':	is_CONS_FINAL,
    251 	'FM':	is_CONS_FINAL_MOD,
    252 	'M':	is_CONS_MED,
    253 	'CM':	is_CONS_MOD,
    254 	'SUB':	is_CONS_SUB,
    255 	'CS':	is_CONS_WITH_STACKER,
    256 	'H':	is_HALANT,
    257 	'HVM':	is_HALANT_OR_VOWEL_MODIFIER,
    258 	'HN':	is_HALANT_NUM,
    259 	'ZWNJ':	is_ZWNJ,
    260 	'ZWJ':	is_ZWJ,
    261 	'WJ':	is_Word_Joiner,
    262 	'O':	is_OTHER,
    263 	'Rsv':	is_Reserved,
    264 	'R':	is_REPHA,
    265 	'S':	is_SYM,
    266 	'SM':	is_SYM_MOD,
    267 	'VS':	is_VARIATION_SELECTOR,
    268 	'V':	is_VOWEL,
    269 	'VM':	is_VOWEL_MOD,
    270 }
    271 
    272 use_positions = {
    273 	'F': {
    274 		'Abv': [Top],
    275 		'Blw': [Bottom],
    276 		'Pst': [Right],
    277 	},
    278 	'M': {
    279 		'Abv': [Top],
    280 		'Blw': [Bottom, Bottom_And_Left],
    281 		'Pst': [Right],
    282 		'Pre': [Left],
    283 	},
    284 	'CM': {
    285 		'Abv': [Top],
    286 		'Blw': [Bottom],
    287 	},
    288 	'V': {
    289 		'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
    290 		'Blw': [Bottom, Overstruck, Bottom_And_Right],
    291 		'Pst': [Right, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
    292 		'Pre': [Left],
    293 	},
    294 	'VM': {
    295 		'Abv': [Top],
    296 		'Blw': [Bottom, Overstruck],
    297 		'Pst': [Right],
    298 		'Pre': [Left],
    299 	},
    300 	'SM': {
    301 		'Abv': [Top],
    302 		'Blw': [Bottom],
    303 	},
    304 	'H': None,
    305 	'HVM': None,
    306 	'B': None,
    307 	'FM': None,
    308 	'SUB': None,
    309 }
    310 
def map_to_use(data):
	"""Classify every code point in data into a USE category.

	data maps a code point to a 4-item sequence: Indic_Syllabic_Category,
	Indic_Positional_Category, General_Category and block name.

	Returns a dict mapping each code point to a (USE tag, block name)
	pair.  The tag is a key of use_mapping, with a positional suffix from
	use_positions appended where that category defines one.

	Raises AssertionError when a character is accepted by zero or several
	use_mapping predicates, when it has a positional category but its USE
	category is not listed in use_positions, or when its positional
	category does not select exactly one suffix.
	"""
	out = {}
	items = use_mapping.items()
	for U,(UISC,UIPC,UGC,UBlock) in data.items():

		# Resolve Indic_Syllabic_Category
		# The overrides below run in order; later ones see the values
		# assigned by earlier ones.

		# TODO: These don't have UISC assigned in Unicode 8.0, but have UIPC
		if U == 0x17DD: UISC = Vowel_Dependent
		if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark

		# Tibetan:
		# TODO: These don't have UISC assigned in Unicode 11.0, but have UIPC
		if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent
		if 0x0F86 <= U <= 0x0F87: UISC = Tone_Mark
		# Overrides to allow NFC order matching syllable
		# https://github.com/harfbuzz/harfbuzz/issues/1012
		if UBlock == 'Tibetan' and is_VOWEL (U, UISC, UGC):
			if UIPC == Top:
				UIPC = Bottom

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/982
		# also  https://github.com/harfbuzz/harfbuzz/issues/1012
		# Chakma vowels have Top and Bottom swapped.
		if UBlock == 'Chakma' and is_VOWEL (U, UISC, UGC):
			if UIPC == Top:
				UIPC = Bottom
			elif UIPC == Bottom:
				UIPC = Top

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/627
		if 0x1BF2 <= U <= 0x1BF3: UISC = Nukta; UIPC = Bottom

		# TODO: U+1CED should only be allowed after some of
		# the nasalization marks, maybe only for U+1CE9..U+1CF1.
		if U == 0x1CED: UISC = Tone_Mark

		# TODO: https://github.com/harfbuzz/harfbuzz/issues/525
		if U == 0x1A7F: UISC = Consonant_Final; UIPC = Bottom

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/609
		if U == 0x20F0: UISC = Cantillation_Mark; UIPC = Top

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/626
		if U == 0xA8B4: UISC = Consonant_Medial

		# TODO: https://github.com/harfbuzz/harfbuzz/issues/1105
		if U == 0x11134: UISC = Gemination_Mark

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/1399
		if U == 0x111C9: UISC = Consonant_Final

		# Exactly one use_mapping predicate must accept the character; its
		# key becomes the USE tag.  (This local 'values' shadows the
		# module-level list of the same name.)
		values = [k for k,v in items if v(U,UISC,UGC)]
		assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values)
		USE = values[0]

		# Resolve Indic_Positional_Category

		# TODO: Not in Unicode 8.0 yet, but in spec.
		if U == 0x1B6C: UIPC = Bottom

		# TODO: These should die, but have UIPC in Unicode 8.0
		if U in [0x953, 0x954]: UIPC = Not_Applicable

		# TODO: In USE's override list but not in Unicode 11.0
		if U == 0x103C: UIPC = Left

		# TODO: These are not in USE's override list that we have, nor are they in Unicode 11.0
		if 0xA926 <= U <= 0xA92A: UIPC = Top
		if U == 0x111CA: UIPC = Bottom
		if U == 0x11300: UIPC = Top
		# TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
		if U == 0x11302: UIPC = Top
		if U == 0x1133C: UIPC = Bottom
		if U == 0x1171E: UIPC = Left # Correct?!
		if 0x1CF2 <= U <= 0x1CF3: UIPC = Right
		if 0x1CF8 <= U <= 0x1CF9: UIPC = Top
		# https://github.com/roozbehp/unicode-data/issues/8
		if U == 0x0A51: UIPC = Bottom

		# A character with a positional category other than Not_Applicable
		# or Visual_Order_Left must belong to a USE category that
		# use_positions knows about.
		assert (UIPC in [Not_Applicable, Visual_Order_Left] or
			USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)

		# Categories mapped to None (or absent) keep the bare tag; the
		# others get the one suffix whose position list contains UIPC.
		pos_mapping = use_positions.get(USE, None)
		if pos_mapping:
			values = [k for k,v in pos_mapping.items() if v and UIPC in v]
			assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values)
			USE = USE + values[0]

		out[U] = (USE, UBlock)
	return out
    401 
    402 defaults = ('O', 'No_Block')
    403 data = map_to_use(data)
    404 
    405 print ("/* == Start of generated table == */")
    406 print ("/*")
    407 print (" * The following table is generated by running:")
    408 print (" *")
    409 print (" *   ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt")
    410 print (" *")
    411 print (" * on files with these headers:")
    412 print (" *")
    413 for h in headers:
    414 	for l in h:
    415 		print (" * %s" % (l.strip()))
    416 print (" */")
    417 print ()
    418 print ('#include "hb-ot-shape-complex-use.hh"')
    419 print ()
    420 
    421 total = 0
    422 used = 0
    423 last_block = None
    424 def print_block (block, start, end, data):
    425 	global total, used, last_block
    426 	if block and block != last_block:
    427 		print ()
    428 		print ()
    429 		print ("  /* %s */" % block)
    430 		if start % 16:
    431 			print (' ' * (20 + (start % 16 * 6)), end='')
    432 	num = 0
    433 	assert start % 8 == 0
    434 	assert (end+1) % 8 == 0
    435 	for u in range (start, end+1):
    436 		if u % 16 == 0:
    437 			print ()
    438 			print ("  /* %04X */" % u, end='')
    439 		if u in data:
    440 			num += 1
    441 		d = data.get (u, defaults)
    442 		print ("%6s," % d[0], end='')
    443 
    444 	total += end - start + 1
    445 	used += num
    446 	if block:
    447 		last_block = block
    448 
    449 uu = sorted (data.keys ())
    450 
    451 last = -100000
    452 num = 0
    453 offset = 0
    454 starts = []
    455 ends = []
    456 for k,v in sorted(use_mapping.items()):
    457 	if k in use_positions and use_positions[k]: continue
    458 	print ("#define %s	USE_%s	/* %s */" % (k, k, v.__name__[3:]))
    459 for k,v in sorted(use_positions.items()):
    460 	if not v: continue
    461 	for suf in v.keys():
    462 		tag = k + suf
    463 		print ("#define %s	USE_%s" % (tag, tag))
    464 print ("")
    465 print ("static const USE_TABLE_ELEMENT_TYPE use_table[] = {")
    466 for u in uu:
    467 	if u <= last:
    468 		continue
    469 	block = data[u][1]
    470 
    471 	start = u//8*8
    472 	end = start+1
    473 	while end in uu and block == data[end][1]:
    474 		end += 1
    475 	end = (end-1)//8*8 + 7
    476 
    477 	if start != last + 1:
    478 		if start - last <= 1+16*3:
    479 			print_block (None, last+1, start-1, data)
    480 			last = start-1
    481 		else:
    482 			if last >= 0:
    483 				ends.append (last + 1)
    484 				offset += ends[-1] - starts[-1]
    485 			print ()
    486 			print ()
    487 			print ("#define use_offset_0x%04xu %d" % (start, offset))
    488 			starts.append (start)
    489 
    490 	print_block (block, start, end, data)
    491 	last = end
    492 ends.append (last + 1)
    493 offset += ends[-1] - starts[-1]
    494 print ()
    495 print ()
    496 occupancy = used * 100. / total
    497 page_bits = 12
    498 print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
    499 print ()
    500 print ("USE_TABLE_ELEMENT_TYPE")
    501 print ("hb_use_get_category (hb_codepoint_t u)")
    502 print ("{")
    503 print ("  switch (u >> %d)" % page_bits)
    504 print ("  {")
    505 pages = set([u>>page_bits for u in starts+ends])
    506 for p in sorted(pages):
    507 	print ("    case 0x%0Xu:" % p)
    508 	for (start,end) in zip (starts, ends):
    509 		if p not in [start>>page_bits, end>>page_bits]: continue
    510 		offset = "use_offset_0x%04xu" % start
    511 		print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
    512 	print ("      break;")
    513 	print ("")
    514 print ("    default:")
    515 print ("      break;")
    516 print ("  }")
    517 print ("  return USE_O;")
    518 print ("}")
    519 print ()
    520 for k in sorted(use_mapping.keys()):
    521 	if k in use_positions and use_positions[k]: continue
    522 	print ("#undef %s" % k)
    523 for k,v in sorted(use_positions.items()):
    524 	if not v: continue
    525 	for suf in v.keys():
    526 		tag = k + suf
    527 		print ("#undef %s" % tag)
    528 print ()
    529 print ("/* == End of generated table == */")
    530 
    531 # Maintain at least 50% occupancy in the table */
    532 if occupancy < 50:
    533 	raise Exception ("Table too sparse, please investigate: ", occupancy)
    534