#!/usr/bin/python
"""Generate the USE category table (C source) from Unicode data files.

Usage:
    ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt

The four files are parsed, merged per code point, every code point is mapped
to exactly one USE category (see ``use_mapping``), optionally refined with a
positional suffix (see ``use_positions``), and the resulting table plus the
``hb_use_get_categories`` accessor are written to standard output.

The script runs unchanged under Python 2.7 and Python 3: all output goes
through ``sys.stdout.write`` instead of the ``print`` statement.
"""

import io
import sys

try:
    basestring
except NameError:
    # Python 3 has no ``basestring``; ``str`` plays the same role there.
    basestring = str

BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]

# Per-file fallback values, in command-line order:
# (Indic_Syllabic_Category, Indic_Positional_Category, General_Category, Block).
UCD_DEFAULTS = ('Other', 'Not_Applicable', 'Cn', 'No_Block')

# (USE category, block) used by print_block() for code points without data.
defaults = ('O', 'No_Block')


property_names = [
    # General_Category
    'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
    'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
    'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
    # Indic_Syllabic_Category
    'Other',
    'Bindu',
    'Visarga',
    'Avagraha',
    'Nukta',
    'Virama',
    'Pure_Killer',
    'Invisible_Stacker',
    'Vowel_Independent',
    'Vowel_Dependent',
    'Vowel',
    'Consonant_Placeholder',
    'Consonant',
    'Consonant_Dead',
    'Consonant_With_Stacker',
    'Consonant_Prefixed',
    'Consonant_Preceding_Repha',
    'Consonant_Succeeding_Repha',
    'Consonant_Subjoined',
    'Consonant_Medial',
    'Consonant_Final',
    'Consonant_Head_Letter',
    'Modifying_Letter',
    'Tone_Letter',
    'Tone_Mark',
    'Gemination_Mark',
    'Cantillation_Mark',
    'Register_Shifter',
    'Syllable_Modifier',
    'Consonant_Killer',
    'Non_Joiner',
    'Joiner',
    'Number_Joiner',
    'Number',
    'Brahmi_Joining_Number',
    # Indic_Positional_Category
    'Not_Applicable',
    'Right',
    'Left',
    'Visual_Order_Left',
    'Left_And_Right',
    'Top',
    'Bottom',
    'Top_And_Bottom',
    'Top_And_Right',
    'Top_And_Left',
    'Top_And_Left_And_Right',
    'Bottom_And_Left',
    'Bottom_And_Right',
    'Top_And_Bottom_And_Right',
    'Overstruck',
]


class PropertyValue(object):
    """A named property value that compares equal to its own name.

    Instances compare equal both to other ``PropertyValue`` objects with the
    same name and to plain strings holding that name, so values parsed from
    the data files (strings) can be tested against the constants directly.
    """

    def __init__(self, name_):
        self.name = name_

    def __str__(self):
        return self.name

    def __eq__(self, other):
        return self.name == (other if isinstance(other, basestring) else other.name)

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        # Hash like the name so that equality and hashing agree; defining
        # __eq__ alone leaves the object unhashable on Python 3 and
        # identity-hashed on Python 2.
        return hash(self.name)


property_values = {}

# Expose every property value as a module-level constant (Lo, Top, Virama, ...).
# The checks are done on the name: a duplicate entry in property_names or a
# clash with an existing global would otherwise be silently overwritten.
for name in property_names:
    assert name not in property_values
    assert name not in globals()
    property_values[name] = PropertyValue(name)
globals().update(property_values)


# Category predicates.  Each takes the code point U, its
# Indic_Syllabic_Category UISC and its General_Category UGC, and returns
# whether the code point belongs to the category.  map_to_use() requires that
# exactly one predicate holds for every code point.

def is_BASE(U, UISC, UGC):
    return (UISC in [Number, Consonant, Consonant_Head_Letter,
                     #SPEC-DRAFT Consonant_Placeholder,
                     Tone_Letter,
                     Vowel_Independent #SPEC-DRAFT
                     ] or
            (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
                                    Consonant_Subjoined, Vowel, Vowel_Dependent]))

def is_BASE_IND(U, UISC, UGC):
    #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
    # SPEC-DRAFT-OUTDATED! U == 0x002D is no longer part of this category.
    return (UISC in [Consonant_Dead, Modifying_Letter] or
            (UGC == Po and not U in [0x104E, 0x2022, 0x11A3F, 0x11A45]))

def is_BASE_NUM(U, UISC, UGC):
    return UISC == Brahmi_Joining_Number

def is_BASE_OTHER(U, UISC, UGC):
    if UISC == Consonant_Placeholder: return True #SPEC-DRAFT
    #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
    return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]

def is_CGJ(U, UISC, UGC):
    return U == 0x034F

def is_CONS_FINAL(U, UISC, UGC):
    return ((UISC == Consonant_Final and UGC != Lo) or
            UISC == Consonant_Succeeding_Repha)

def is_CONS_FINAL_MOD(U, UISC, UGC):
    #SPEC-DRAFT return UISC in [Consonant_Final_Modifier, Syllable_Modifier]
    return UISC == Syllable_Modifier

def is_CONS_MED(U, UISC, UGC):
    return UISC == Consonant_Medial and UGC != Lo

def is_CONS_MOD(U, UISC, UGC):
    return UISC in [Nukta, Gemination_Mark, Consonant_Killer]

def is_CONS_SUB(U, UISC, UGC):
    #SPEC-DRAFT return UISC == Consonant_Subjoined
    return UISC == Consonant_Subjoined and UGC != Lo

def is_CONS_WITH_STACKER(U, UISC, UGC):
    return UISC == Consonant_With_Stacker

def is_HALANT(U, UISC, UGC):
    return UISC in [Virama, Invisible_Stacker]

def is_HALANT_NUM(U, UISC, UGC):
    return UISC == Number_Joiner

def is_ZWNJ(U, UISC, UGC):
    return UISC == Non_Joiner

def is_ZWJ(U, UISC, UGC):
    return UISC == Joiner

def is_Word_Joiner(U, UISC, UGC):
    return U == 0x2060

def is_OTHER(U, UISC, UGC):
    #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters
    return (UISC == Other
            and not is_SYM_MOD(U, UISC, UGC)
            and not is_CGJ(U, UISC, UGC)
            and not is_Word_Joiner(U, UISC, UGC)
            and not is_VARIATION_SELECTOR(U, UISC, UGC))

def is_Reserved(U, UISC, UGC):
    return UGC == Cn

def is_REPHA(U, UISC, UGC):
    return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]

def is_SYM(U, UISC, UGC):
    if U == 0x25CC: return False #SPEC-DRAFT
    #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter
    return UGC in [So, Sc]

def is_SYM_MOD(U, UISC, UGC):
    return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]

def is_VARIATION_SELECTOR(U, UISC, UGC):
    return 0xFE00 <= U <= 0xFE0F

def is_VOWEL(U, UISC, UGC):
    # https://github.com/roozbehp/unicode-data/issues/6
    return (UISC == Pure_Killer or
            (UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29]))

def is_VOWEL_MOD(U, UISC, UGC):
    # https://github.com/roozbehp/unicode-data/issues/6
    return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
            (UGC != Lo and (UISC == Bindu or U in [0xAA29])))


# USE category name -> predicate deciding membership.
use_mapping = {
    'B': is_BASE,
    'IND': is_BASE_IND,
    'N': is_BASE_NUM,
    'GB': is_BASE_OTHER,
    'CGJ': is_CGJ,
    'F': is_CONS_FINAL,
    'FM': is_CONS_FINAL_MOD,
    'M': is_CONS_MED,
    'CM': is_CONS_MOD,
    'SUB': is_CONS_SUB,
    'CS': is_CONS_WITH_STACKER,
    'H': is_HALANT,
    'HN': is_HALANT_NUM,
    'ZWNJ': is_ZWNJ,
    'ZWJ': is_ZWJ,
    'WJ': is_Word_Joiner,
    'O': is_OTHER,
    'Rsv': is_Reserved,
    'R': is_REPHA,
    'S': is_SYM,
    'SM': is_SYM_MOD,
    'VS': is_VARIATION_SELECTOR,
    'V': is_VOWEL,
    'VM': is_VOWEL_MOD,
}

# USE category -> {suffix: positional categories selecting that suffix}.
# A value of None means the category may carry a positional category but
# gets no suffix.  Categories absent from this dict must have a positional
# category of Not_Applicable or Visual_Order_Left (asserted in map_to_use).
use_positions = {
    'F': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Right],
    },
    'M': {
        'Abv': [Top],
        'Blw': [Bottom, Bottom_And_Left],
        'Pst': [Right],
        'Pre': [Left],
    },
    'CM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'V': {
        'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
        'Blw': [Bottom, Overstruck, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
    },
    'VM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
        'Pst': [Right],
        'Pre': [Left],
    },
    'SM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'H': None,
    'B': None,
    'FM': None,
    'SUB': None,
}


def map_to_use(data):
    """Map merged per-code-point properties to USE categories.

    ``data`` maps a code point to a 4-item sequence
    ``(UISC, UIPC, UGC, UBlock)``.  Returns a dict mapping each code point to
    ``(USE, UBlock)`` where ``USE`` is the category name from ``use_mapping``
    with the positional suffix from ``use_positions`` appended when the
    category has one.

    Raises AssertionError if a code point matches zero or several categories,
    or if its positional category cannot be resolved to exactly one suffix.
    """
    out = {}
    items = use_mapping.items()
    for U, (UISC, UIPC, UGC, UBlock) in data.items():

        # Resolve Indic_Syllabic_Category

        # TODO: These don't have UISC assigned in Unicode 8.0, but
        # have UIPC
        if U == 0x17DD:
            UISC = Vowel_Dependent
        if 0x1CE2 <= U <= 0x1CE8:
            UISC = Cantillation_Mark

        # TODO: U+1CED should only be allowed after some of
        # the nasalization marks, maybe only for U+1CE9..U+1CF1.
        if U == 0x1CED:
            UISC = Tone_Mark

        # TODO: https://github.com/harfbuzz/harfbuzz/issues/525
        if U == 0x1A7F:
            UISC = Consonant_Final
            UIPC = Bottom

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/609
        if U == 0x20F0:
            UISC = Cantillation_Mark
            UIPC = Top

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/626
        if U == 0xA8B4:
            UISC = Consonant_Medial

        matches = [k for k, v in items if v(U, UISC, UGC)]
        assert len(matches) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, matches)
        USE = matches[0]

        # Resolve Indic_Positional_Category

        # TODO: Not in Unicode 8.0 yet, but in spec.
        if U == 0x1B6C:
            UIPC = Bottom

        # TODO: These should die, but have UIPC in Unicode 8.0
        if U in [0x953, 0x954]:
            UIPC = Not_Applicable

        # TODO: In USE's override list but not in Unicode 8.0
        if U == 0x103C:
            UIPC = Left

        # TODO: These are not in USE's override list that we have, nor are they in Unicode 8.0
        if 0xA926 <= U <= 0xA92A:
            UIPC = Top
        if U == 0x111CA:
            UIPC = Bottom
        if U == 0x11300:
            UIPC = Top
        if U == 0x1133C:
            UIPC = Bottom
        if U == 0x1171E:
            UIPC = Left # Correct?!
        if 0x1CF2 <= U <= 0x1CF3:
            UIPC = Right
        if 0x1CF8 <= U <= 0x1CF9:
            UIPC = Top

        assert (UIPC in [Not_Applicable, Visual_Order_Left] or
                USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)

        pos_mapping = use_positions.get(USE, None)
        if pos_mapping:
            suffixes = [k for k, v in pos_mapping.items() if v and UIPC in v]
            assert len(suffixes) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, suffixes)
            USE = USE + suffixes[0]

        out[U] = (USE, UBlock)
    return out


# Running statistics maintained by print_block().
total = 0
used = 0
last_block = None


def print_block(block, start, end, data):
    """Write the table cells for code points ``start``..``end`` inclusive.

    ``block`` is the block name used for the heading comment, or None for
    filler between blocks (no heading).  ``start`` and ``end + 1`` must be
    multiples of 8.  Code points missing from ``data`` are written with the
    category from ``defaults``.  Updates the module-level ``total``, ``used``
    and ``last_block`` counters.
    """
    global total, used, last_block
    write = sys.stdout.write
    if block and block != last_block:
        write("\n")
        write("\n")
        write(" /* %s */\n" % block)
        if start % 16:
            # The block starts mid-row: pad up to its first column.
            write(' ' * (20 + (start % 16 * 6)))
    num = 0
    assert start % 8 == 0
    assert (end + 1) % 8 == 0
    for u in range(start, end + 1):
        if u % 16 == 0:
            write("\n")
            write(" /* %04X */" % u)
        if u in data:
            num += 1
        d = data.get(u, defaults)
        write("%6s," % d[0])

    total += end - start + 1
    used += num
    if block:
        last_block = block


def _out(text=""):
    """Write ``text`` followed by a newline to standard output."""
    sys.stdout.write(text + "\n")


def _read_inputs(paths):
    """Parse the data files named by ``paths``.

    Returns ``(headers, data)``.  ``headers`` holds the first two lines of
    every file except the third one (UnicodeData.txt), followed by a
    placeholder entry for it.  ``data`` holds one dict per file mapping code
    point to the parsed value: field 2 for the third file, field 1 otherwise.
    """
    headers = []
    data = []
    for index, path in enumerate(paths):
        table = {}
        # The context manager closes each file once it has been consumed.
        with io.open(path, encoding='utf-8') as f:
            if index != 2:
                headers.append([f.readline() for _ in range(2)])
            for line in f:
                comment = line.find('#')
                if comment >= 0:
                    line = line[:comment]

                fields = [x.strip() for x in line.split(';')]
                if len(fields) == 1:
                    continue

                bounds = fields[0].split('..')
                first = int(bounds[0], 16)
                last = first if len(bounds) == 1 else int(bounds[1], 16)

                value = fields[1 if index != 2 else 2]
                for u in range(first, last + 1):
                    table[u] = value
        data.append(table)
    headers.append(["UnicodeData.txt does not have a header."])
    return headers, data


def _merge_properties(data):
    """Merge the per-file dicts into one dict of 4-item property lists.

    Only code points present in one of the first two files are kept; the
    remaining files merely fill in their column.  Code points whose block is
    in BLACKLISTED_BLOCKS are dropped.  Note: ``data[0]`` is modified in
    place to add the extra characters below.
    """
    # TODO Characters that are not in Unicode Indic files, but used in USE
    data[0][0x034F] = UCD_DEFAULTS[0]
    data[0][0x2060] = UCD_DEFAULTS[0]
    data[0][0x20F0] = UCD_DEFAULTS[0]
    for u in range(0xFE00, 0xFE0F + 1):
        data[0][u] = UCD_DEFAULTS[0]

    # Merge data into one dict:
    combined = {}
    for i, d in enumerate(data):
        for u, v in d.items():
            if u not in combined:
                if i >= 2:
                    continue
                combined[u] = list(UCD_DEFAULTS)
            combined[u][i] = v
    return {k: v for k, v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}


def _main(argv):
    """Run the generator; ``argv`` is the full command line (sys.argv)."""
    if len(argv) != 5:
        sys.stderr.write("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt\n")
        sys.exit(1)

    headers, per_file = _read_inputs(argv[1:])
    data = map_to_use(_merge_properties(per_file))

    # Remove the outliers
    singles = {}
    for u in [0x034F, 0x25CC, 0x1107F]:
        singles[u] = data.pop(u)

    _out("/* == Start of generated table == */")
    _out("/*")
    _out(" * The following table is generated by running:")
    _out(" *")
    _out(" * ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt")
    _out(" *")
    _out(" * on files with these headers:")
    _out(" *")
    for h in headers:
        for l in h:
            _out(" * %s" % (l.strip()))
    _out(" */")
    _out()
    _out('#include "hb-ot-shape-complex-use-private.hh"')
    _out()

    uu = sorted(data)

    last = -100000
    offset = 0
    starts = []
    ends = []
    for k in sorted(use_mapping):
        if k in use_positions and use_positions[k]: continue
        _out("#define %s USE_%s /* %s */" % (k, k, use_mapping[k].__name__[3:]))
    for k in sorted(use_positions):
        v = use_positions[k]
        if not v: continue
        for suf in v.keys():
            tag = k + suf
            _out("#define %s USE_%s" % (tag, tag))
    _out("")
    _out("static const USE_TABLE_ELEMENT_TYPE use_table[] = {")
    for u in uu:
        if u <= last:
            continue
        block = data[u][1]

        start = u // 8 * 8
        end = start + 1
        # uu holds exactly the keys of data, so test membership on the dict
        # (constant time) instead of scanning the list.
        while end in data and block == data[end][1]:
            end += 1
        end = (end - 1) // 8 * 8 + 7

        if start != last + 1:
            if start - last <= 1 + 16 * 3:
                # Small gap: pad it with default cells instead of starting
                # a new range.
                print_block(None, last + 1, start - 1, data)
                last = start - 1
            else:
                if last >= 0:
                    ends.append(last + 1)
                    offset += ends[-1] - starts[-1]
                _out()
                _out()
                _out("#define use_offset_0x%04xu %d" % (start, offset))
                starts.append(start)

        print_block(block, start, end, data)
        last = end
    ends.append(last + 1)
    offset += ends[-1] - starts[-1]
    _out()
    _out()
    occupancy = used * 100. / total
    page_bits = 12
    _out("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
    _out()
    _out("USE_TABLE_ELEMENT_TYPE")
    _out("hb_use_get_categories (hb_codepoint_t u)")
    _out("{")
    _out(" switch (u >> %d)" % page_bits)
    _out(" {")
    pages = set([u >> page_bits for u in starts + ends + list(singles)])
    for p in sorted(pages):
        _out(" case 0x%0Xu:" % p)
        for (start, end) in zip(starts, ends):
            if p not in [start >> page_bits, end >> page_bits]: continue
            offset = "use_offset_0x%04xu" % start
            _out(" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end - 1, start, offset))
        for u, d in singles.items():
            if p != u >> page_bits: continue
            _out(" if (unlikely (u == 0x%04Xu)) return %s;" % (u, d[0]))
        _out(" break;")
        _out("")
    _out(" default:")
    _out(" break;")
    _out(" }")
    _out(" return USE_O;")
    _out("}")
    _out()
    for k in sorted(use_mapping.keys()):
        if k in use_positions and use_positions[k]: continue
        _out("#undef %s" % k)
    for k in sorted(use_positions):
        v = use_positions[k]
        if not v: continue
        for suf in v.keys():
            tag = k + suf
            _out("#undef %s" % tag)
    _out()
    _out("/* == End of generated table == */")

    # Maintain at least 50% occupancy in the table
    if occupancy < 50:
        raise Exception("Table too sparse, please investigate: ", occupancy)


if __name__ == "__main__":
    _main(sys.argv)