1 #!/usr/bin/env python 2 3 from __future__ import print_function, division, absolute_import 4 5 import io, sys 6 7 if len (sys.argv) != 5: 8 print ("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt", file=sys.stderr) 9 sys.exit (1) 10 11 BLACKLISTED_BLOCKS = ["Thai", "Lao"] 12 13 files = [io.open (x, encoding='utf-8') for x in sys.argv[1:]] 14 15 headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2] 16 headers.append (["UnicodeData.txt does not have a header."]) 17 18 data = [{} for f in files] 19 values = [{} for f in files] 20 for i, f in enumerate (files): 21 for line in f: 22 23 j = line.find ('#') 24 if j >= 0: 25 line = line[:j] 26 27 fields = [x.strip () for x in line.split (';')] 28 if len (fields) == 1: 29 continue 30 31 uu = fields[0].split ('..') 32 start = int (uu[0], 16) 33 if len (uu) == 1: 34 end = start 35 else: 36 end = int (uu[1], 16) 37 38 t = fields[1 if i != 2 else 2] 39 40 for u in range (start, end + 1): 41 data[i][u] = t 42 values[i][t] = values[i].get (t, 0) + end - start + 1 43 44 defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block') 45 46 # TODO Characters that are not in Unicode Indic files, but used in USE 47 data[0][0x034F] = defaults[0] 48 data[0][0x2060] = defaults[0] 49 data[0][0x20F0] = defaults[0] 50 # TODO https://github.com/roozbehp/unicode-data/issues/9 51 data[0][0x11C44] = 'Consonant_Placeholder' 52 data[0][0x11C45] = 'Consonant_Placeholder' 53 # TODO https://github.com/harfbuzz/harfbuzz/pull/1399 54 data[0][0x111C8] = 'Consonant_Placeholder' 55 for u in range (0xFE00, 0xFE0F + 1): 56 data[0][u] = defaults[0] 57 58 # Merge data into one dict: 59 for i,v in enumerate (defaults): 60 values[i][v] = values[i].get (v, 0) + 1 61 combined = {} 62 for i,d in enumerate (data): 63 for u,v in d.items (): 64 if i >= 2 and not u in combined: 65 continue 66 if not u in combined: 67 combined[u] = list (defaults) 68 combined[u][i] = v 69 combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS} 70 data = combined 71 del combined 72 num = len (data) 73 74 75 property_names = [ 76 # General_Category 77 'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc', 78 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 79 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs', 80 # Indic_Syllabic_Category 81 'Other', 82 'Bindu', 83 'Visarga', 84 'Avagraha', 85 'Nukta', 86 'Virama', 87 'Pure_Killer', 88 'Invisible_Stacker', 89 'Vowel_Independent', 90 'Vowel_Dependent', 91 'Vowel', 92 'Consonant_Placeholder', 93 'Consonant', 94 'Consonant_Dead', 95 'Consonant_With_Stacker', 96 'Consonant_Prefixed', 97 'Consonant_Preceding_Repha', 98 'Consonant_Succeeding_Repha', 99 'Consonant_Subjoined', 100 'Consonant_Medial', 101 'Consonant_Final', 102 'Consonant_Head_Letter', 103 'Consonant_Initial_Postfixed', 104 'Modifying_Letter', 105 'Tone_Letter', 106 'Tone_Mark', 107 'Gemination_Mark', 108 'Cantillation_Mark', 109 'Register_Shifter', 110 'Syllable_Modifier', 111 'Consonant_Killer', 112 'Non_Joiner', 113 'Joiner', 114 'Number_Joiner', 115 'Number', 116 'Brahmi_Joining_Number', 117 # Indic_Positional_Category 118 'Not_Applicable', 119 'Right', 120 'Left', 121 'Visual_Order_Left', 122 'Left_And_Right', 123 'Top', 124 'Bottom', 125 'Top_And_Bottom', 126 'Top_And_Right', 127 'Top_And_Left', 128 'Top_And_Left_And_Right', 129 'Bottom_And_Left', 130 'Bottom_And_Right', 131 'Top_And_Bottom_And_Right', 132 'Overstruck', 133 ] 134 135 try: 136 basestring 137 except NameError: 138 basestring = str 139 140 class PropertyValue(object): 141 def __init__(self, name_): 142 self.name = name_ 143 def __str__(self): 144 return self.name 145 def __eq__(self, other): 146 return self.name == (other if isinstance(other, basestring) else other.name) 147 def __ne__(self, other): 148 return not (self == other) 149 def __hash__(self): 150 return hash(str(self)) 151 152 property_values = {} 153 154 for name in property_names: 155 value = PropertyValue(name) 156 assert value not in property_values 157 assert value not in globals() 158 property_values[name] = value 159 globals().update(property_values) 160 161 162 def is_BASE(U, UISC, UGC): 163 return (UISC in [Number, Consonant, Consonant_Head_Letter, 164 #SPEC-DRAFT Consonant_Placeholder, 165 Tone_Letter, 166 Vowel_Independent #SPEC-DRAFT 167 ] or 168 (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial, 169 Consonant_Subjoined, Vowel, Vowel_Dependent])) 170 def is_BASE_IND(U, UISC, UGC): 171 #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po) 172 return (UISC in [Consonant_Dead, Modifying_Letter] or 173 (UGC == Po and not U in [0x104B, 0x104E, 0x2022, 0x111C8, 0x11A3F, 0x11A45, 0x11C44, 0x11C45]) or 174 False # SPEC-DRAFT-OUTDATED! U == 0x002D 175 ) 176 def is_BASE_NUM(U, UISC, UGC): 177 return UISC == Brahmi_Joining_Number 178 def is_BASE_OTHER(U, UISC, UGC): 179 if UISC == Consonant_Placeholder: return True #SPEC-DRAFT 180 #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE] 181 return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE] 182 def is_CGJ(U, UISC, UGC): 183 return U == 0x034F 184 def is_CONS_FINAL(U, UISC, UGC): 185 # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec. 186 return ((UISC == Consonant_Final and UGC != Lo) or 187 UISC == Consonant_Initial_Postfixed or 188 UISC == Consonant_Succeeding_Repha) 189 def is_CONS_FINAL_MOD(U, UISC, UGC): 190 #SPEC-DRAFT return UISC in [Consonant_Final_Modifier, Syllable_Modifier] 191 return UISC == Syllable_Modifier 192 def is_CONS_MED(U, UISC, UGC): 193 return UISC == Consonant_Medial and UGC != Lo 194 def is_CONS_MOD(U, UISC, UGC): 195 return UISC in [Nukta, Gemination_Mark, Consonant_Killer] 196 def is_CONS_SUB(U, UISC, UGC): 197 #SPEC-DRAFT return UISC == Consonant_Subjoined 198 return UISC == Consonant_Subjoined and UGC != Lo 199 def is_CONS_WITH_STACKER(U, UISC, UGC): 200 return UISC == Consonant_With_Stacker 201 def is_HALANT(U, UISC, UGC): 202 return UISC in [Virama, Invisible_Stacker] and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC) 203 def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC): 204 # https://github.com/harfbuzz/harfbuzz/issues/1102 205 # https://github.com/harfbuzz/harfbuzz/issues/1379 206 return U in [0x11046, 0x1134D] 207 def is_HALANT_NUM(U, UISC, UGC): 208 return UISC == Number_Joiner 209 def is_ZWNJ(U, UISC, UGC): 210 return UISC == Non_Joiner 211 def is_ZWJ(U, UISC, UGC): 212 return UISC == Joiner 213 def is_Word_Joiner(U, UISC, UGC): 214 return U == 0x2060 215 def is_OTHER(U, UISC, UGC): 216 #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters 217 return (UISC == Other 218 and not is_SYM_MOD(U, UISC, UGC) 219 and not is_CGJ(U, UISC, UGC) 220 and not is_Word_Joiner(U, UISC, UGC) 221 and not is_VARIATION_SELECTOR(U, UISC, UGC) 222 ) 223 def is_Reserved(U, UISC, UGC): 224 return UGC == 'Cn' 225 def is_REPHA(U, UISC, UGC): 226 return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed] 227 def is_SYM(U, UISC, UGC): 228 if U == 0x25CC: return False #SPEC-DRAFT 229 #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter 230 return UGC in [So, Sc] 231 def is_SYM_MOD(U, UISC, UGC): 232 return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73] 233 def is_VARIATION_SELECTOR(U, UISC, UGC): 234 return 0xFE00 <= U <= 0xFE0F 235 def is_VOWEL(U, UISC, UGC): 236 # https://github.com/roozbehp/unicode-data/issues/6 237 return (UISC == Pure_Killer or 238 (UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29])) 239 def is_VOWEL_MOD(U, UISC, UGC): 240 # https://github.com/roozbehp/unicode-data/issues/6 241 return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or 242 (UGC != Lo and (UISC == Bindu or U in [0xAA29]))) 243 244 use_mapping = { 245 'B': is_BASE, 246 'IND': is_BASE_IND, 247 'N': is_BASE_NUM, 248 'GB': is_BASE_OTHER, 249 'CGJ': is_CGJ, 250 'F': is_CONS_FINAL, 251 'FM': is_CONS_FINAL_MOD, 252 'M': is_CONS_MED, 253 'CM': is_CONS_MOD, 254 'SUB': is_CONS_SUB, 255 'CS': is_CONS_WITH_STACKER, 256 'H': is_HALANT, 257 'HVM': is_HALANT_OR_VOWEL_MODIFIER, 258 'HN': is_HALANT_NUM, 259 'ZWNJ': is_ZWNJ, 260 'ZWJ': is_ZWJ, 261 'WJ': is_Word_Joiner, 262 'O': is_OTHER, 263 'Rsv': is_Reserved, 264 'R': is_REPHA, 265 'S': is_SYM, 266 'SM': is_SYM_MOD, 267 'VS': is_VARIATION_SELECTOR, 268 'V': is_VOWEL, 269 'VM': is_VOWEL_MOD, 270 } 271 272 use_positions = { 273 'F': { 274 'Abv': [Top], 275 'Blw': [Bottom], 276 'Pst': [Right], 277 }, 278 'M': { 279 'Abv': [Top], 280 'Blw': [Bottom, Bottom_And_Left], 281 'Pst': [Right], 282 'Pre': [Left], 283 }, 284 'CM': { 285 'Abv': [Top], 286 'Blw': [Bottom], 287 }, 288 'V': { 289 'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right], 290 'Blw': [Bottom, Overstruck, Bottom_And_Right], 291 'Pst': [Right, Top_And_Left, Top_And_Left_And_Right, Left_And_Right], 292 'Pre': [Left], 293 }, 294 'VM': { 295 'Abv': [Top], 296 'Blw': [Bottom, Overstruck], 297 'Pst': [Right], 298 'Pre': [Left], 299 }, 300 'SM': { 301 'Abv': [Top], 302 'Blw': [Bottom], 303 }, 304 'H': None, 305 'HVM': None, 306 'B': None, 307 'FM': None, 308 'SUB': None, 309 } 310 311 def map_to_use(data): 312 out = {} 313 items = use_mapping.items() 314 for U,(UISC,UIPC,UGC,UBlock) in data.items(): 315 316 # Resolve Indic_Syllabic_Category 317 318 # TODO: These don't have UISC assigned in Unicode 8.0, but have UIPC 319 if U == 0x17DD: UISC = Vowel_Dependent 320 if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark 321 322 # Tibetan: 323 # TODO: These don't have UISC assigned in Unicode 11.0, but have UIPC 324 if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent 325 if 0x0F86 <= U <= 0x0F87: UISC = Tone_Mark 326 # Overrides to allow NFC order matching syllable 327 # https://github.com/harfbuzz/harfbuzz/issues/1012 328 if UBlock == 'Tibetan' and is_VOWEL (U, UISC, UGC): 329 if UIPC == Top: 330 UIPC = Bottom 331 332 # TODO: https://github.com/harfbuzz/harfbuzz/pull/982 333 # also https://github.com/harfbuzz/harfbuzz/issues/1012 334 if UBlock == 'Chakma' and is_VOWEL (U, UISC, UGC): 335 if UIPC == Top: 336 UIPC = Bottom 337 elif UIPC == Bottom: 338 UIPC = Top 339 340 # TODO: https://github.com/harfbuzz/harfbuzz/pull/627 341 if 0x1BF2 <= U <= 0x1BF3: UISC = Nukta; UIPC = Bottom 342 343 # TODO: U+1CED should only be allowed after some of 344 # the nasalization marks, maybe only for U+1CE9..U+1CF1. 345 if U == 0x1CED: UISC = Tone_Mark 346 347 # TODO: https://github.com/harfbuzz/harfbuzz/issues/525 348 if U == 0x1A7F: UISC = Consonant_Final; UIPC = Bottom 349 350 # TODO: https://github.com/harfbuzz/harfbuzz/pull/609 351 if U == 0x20F0: UISC = Cantillation_Mark; UIPC = Top 352 353 # TODO: https://github.com/harfbuzz/harfbuzz/pull/626 354 if U == 0xA8B4: UISC = Consonant_Medial 355 356 # TODO: https://github.com/harfbuzz/harfbuzz/issues/1105 357 if U == 0x11134: UISC = Gemination_Mark 358 359 # TODO: https://github.com/harfbuzz/harfbuzz/pull/1399 360 if U == 0x111C9: UISC = Consonant_Final 361 362 values = [k for k,v in items if v(U,UISC,UGC)] 363 assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values) 364 USE = values[0] 365 366 # Resolve Indic_Positional_Category 367 368 # TODO: Not in Unicode 8.0 yet, but in spec. 369 if U == 0x1B6C: UIPC = Bottom 370 371 # TODO: These should die, but have UIPC in Unicode 8.0 372 if U in [0x953, 0x954]: UIPC = Not_Applicable 373 374 # TODO: In USE's override list but not in Unicode 11.0 375 if U == 0x103C: UIPC = Left 376 377 # TODO: These are not in USE's override list that we have, nor are they in Unicode 11.0 378 if 0xA926 <= U <= 0xA92A: UIPC = Top 379 if U == 0x111CA: UIPC = Bottom 380 if U == 0x11300: UIPC = Top 381 # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037 382 if U == 0x11302: UIPC = Top 383 if U == 0x1133C: UIPC = Bottom 384 if U == 0x1171E: UIPC = Left # Correct?! 385 if 0x1CF2 <= U <= 0x1CF3: UIPC = Right 386 if 0x1CF8 <= U <= 0x1CF9: UIPC = Top 387 # https://github.com/roozbehp/unicode-data/issues/8 388 if U == 0x0A51: UIPC = Bottom 389 390 assert (UIPC in [Not_Applicable, Visual_Order_Left] or 391 USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC) 392 393 pos_mapping = use_positions.get(USE, None) 394 if pos_mapping: 395 values = [k for k,v in pos_mapping.items() if v and UIPC in v] 396 assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values) 397 USE = USE + values[0] 398 399 out[U] = (USE, UBlock) 400 return out 401 402 defaults = ('O', 'No_Block') 403 data = map_to_use(data) 404 405 print ("/* == Start of generated table == */") 406 print ("/*") 407 print (" * The following table is generated by running:") 408 print (" *") 409 print (" * ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt") 410 print (" *") 411 print (" * on files with these headers:") 412 print (" *") 413 for h in headers: 414 for l in h: 415 print (" * %s" % (l.strip())) 416 print (" */") 417 print () 418 print ('#include "hb-ot-shape-complex-use.hh"') 419 print () 420 421 total = 0 422 used = 0 423 last_block = None 424 def print_block (block, start, end, data): 425 global total, used, last_block 426 if block and block != last_block: 427 print () 428 print () 429 print (" /* %s */" % block) 430 if start % 16: 431 print (' ' * (20 + (start % 16 * 6)), end='') 432 num = 0 433 assert start % 8 == 0 434 assert (end+1) % 8 == 0 435 for u in range (start, end+1): 436 if u % 16 == 0: 437 print () 438 print (" /* %04X */" % u, end='') 439 if u in data: 440 num += 1 441 d = data.get (u, defaults) 442 print ("%6s," % d[0], end='') 443 444 total += end - start + 1 445 used += num 446 if block: 447 last_block = block 448 449 uu = sorted (data.keys ()) 450 451 last = -100000 452 num = 0 453 offset = 0 454 starts = [] 455 ends = [] 456 for k,v in sorted(use_mapping.items()): 457 if k in use_positions and use_positions[k]: continue 458 print ("#define %s USE_%s /* %s */" % (k, k, v.__name__[3:])) 459 for k,v in sorted(use_positions.items()): 460 if not v: continue 461 for suf in v.keys(): 462 tag = k + suf 463 print ("#define %s USE_%s" % (tag, tag)) 464 print ("") 465 print ("static const USE_TABLE_ELEMENT_TYPE use_table[] = {") 466 for u in uu: 467 if u <= last: 468 continue 469 block = data[u][1] 470 471 start = u//8*8 472 end = start+1 473 while end in uu and block == data[end][1]: 474 end += 1 475 end = (end-1)//8*8 + 7 476 477 if start != last + 1: 478 if start - last <= 1+16*3: 479 print_block (None, last+1, start-1, data) 480 last = start-1 481 else: 482 if last >= 0: 483 ends.append (last + 1) 484 offset += ends[-1] - starts[-1] 485 print () 486 print () 487 print ("#define use_offset_0x%04xu %d" % (start, offset)) 488 starts.append (start) 489 490 print_block (block, start, end, data) 491 last = end 492 ends.append (last + 1) 493 offset += ends[-1] - starts[-1] 494 print () 495 print () 496 occupancy = used * 100. / total 497 page_bits = 12 498 print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy)) 499 print () 500 print ("USE_TABLE_ELEMENT_TYPE") 501 print ("hb_use_get_category (hb_codepoint_t u)") 502 print ("{") 503 print (" switch (u >> %d)" % page_bits) 504 print (" {") 505 pages = set([u>>page_bits for u in starts+ends]) 506 for p in sorted(pages): 507 print (" case 0x%0Xu:" % p) 508 for (start,end) in zip (starts, ends): 509 if p not in [start>>page_bits, end>>page_bits]: continue 510 offset = "use_offset_0x%04xu" % start 511 print (" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset)) 512 print (" break;") 513 print ("") 514 print (" default:") 515 print (" break;") 516 print (" }") 517 print (" return USE_O;") 518 print ("}") 519 print () 520 for k in sorted(use_mapping.keys()): 521 if k in use_positions and use_positions[k]: continue 522 print ("#undef %s" % k) 523 for k,v in sorted(use_positions.items()): 524 if not v: continue 525 for suf in v.keys(): 526 tag = k + suf 527 print ("#undef %s" % tag) 528 print () 529 print ("/* == End of generated table == */") 530 531 # Maintain at least 50% occupancy in the table */ 532 if occupancy < 50: 533 raise Exception ("Table too sparse, please investigate: ", occupancy) 534