#!/usr/bin/python -u
#
# Original script modified in November 2003 to take advantage of
# the character-validation range routines, and updated to the
# current Unicode information (Version 4.0.1)
#
# NOTE: there is an 'alias' facility for blocks which are not present in
#       the current release, but are needed for ABI compatibility.  This
#       must be accomplished MANUALLY!  Please see the comments below under
#       'blockAliases'
#
"""Generate include/libxml/xmlunicode.h and xmlunicode.c.

The script reads the two Unicode Character Database files named in
``sources`` (a Blocks file and a UnicodeData file) from the current
directory and writes a C header and a C source file containing one
membership function per Unicode block and per general category, plus
name-indexed lookup tables.

The code is written to run unchanged on Python 2.6+ and Python 3.
"""
import sys
import string
import time

webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

#
# blockAliases is a small hack - it is used for mapping block names which
# were used in the 3.1 release, but are missing or changed in the current
# release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A,"
    "SupplementaryPrivateUseArea-B",
]

# minTableSize is the threshold for producing a range table: a category
# with MORE than this many ranges gets a table (checked with
# xmlCharInRange); otherwise inline comparisons are generated.
minTableSize = 8

(blockfile, catfile) = sources.split()

# Output files, relative to the directory the script is run from.
headerPath = "include/libxml/xmlunicode.h"
sourcePath = "xmlunicode.c"

# Separator placed between the comparisons of a generated "return(...)".
orSeparator = " ||\n           "

#
# Text templates for the generated files.  The %s slots are filled in by
# the write* functions below.
#
headerPrologue = """/*
 * Summary: Unicode character APIs
 * Description: API for the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Author: Daniel Veillard
 */

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

"""

# Ends with the opening line of the block name table; the entries and the
# closing brace are written by writeNameTables().
sourcePrologue = """/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard <veillard (at] redhat.com>
 */

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */

typedef struct {
    const char *rangename;
    xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
    const xmlUnicodeRange *table;
    int numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);

static const xmlUnicodeRange xmlUnicodeBlocks[] = {
"""

# Slots: number of block table entries, number of category table entries.
lookupTemplate = """static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
 * xmlUnicodeLookup:
 * @tptr: pointer to the name table
 * @name: name to be found
 *
 * binary table lookup for user-supplied name
 *
 * Returns pointer to range function if found, otherwise NULL
 */
static xmlIntFunc
*xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
    int low, high, mid, cmp;
    xmlUnicodeRange *sptr;

    if ((tptr == NULL) || (tname == NULL)) return(NULL);

    low = 0;
    high = tptr->numentries - 1;
    sptr = tptr->table;
    while (low <= high) {
        mid = (low + high) / 2;
        if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
            return (sptr[mid].func);
        if (cmp < 0)
            high = mid - 1;
        else
            low = mid + 1;
    }
    return (NULL);
}

"""

isBlockFunction = """/**
 * xmlUCSIsBlock:
 * @code: UCS code point
 * @block: UCS block name
 *
 * Check whether the character is part of the UCS Block
 *
 * Returns 1 if true, 0 if false and -1 on unknown block
 */
int
xmlUCSIsBlock(int code, const char *block) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
    if (func == NULL)
        return (-1);
    return (func(code));
}

"""

# Last piece of xmlunicode.c: the by-name category test and the trailer.
isCatFunctionAndTrailer = """/**
 * xmlUCSIsCat:
 * @code: UCS code point
 * @cat: UCS Category name
 *
 * Check whether the character is part of the UCS Category
 *
 * Returns 1 if true, 0 if false and -1 on unknown category
 */
int
xmlUCSIsCat(int code, const char *cat) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
    if (func == NULL)
        return (-1);
    return (func(code));
}

#define bottom_xmlunicode
#include "elfgcchack.h"
#endif /* LIBXML_UNICODE_ENABLED */
"""

headerEpilogue = """
#ifdef __cplusplus
}
#endif

#endif /* LIBXML_UNICODE_ENABLED */

#endif /* __XML_UNICODE_H__ */
"""


def openOrExit(path, mode, message):
    """Open *path* with *mode*; on failure print *message* and exit(1)."""
    try:
        return open(path, mode)
    except IOError:
        print(message)
        sys.exit(1)


def dataLines(lines):
    """Yield the stripped data lines of a UCD file.

    Lines whose first character is '#' and lines that are blank after
    stripping are skipped.
    """
    for raw in lines:
        if raw.startswith('#'):
            continue
        line = raw.strip()
        if line:
            yield line


def parseCodePoint(text):
    """Return the integer value of the hexadecimal string *text*.

    Raises ValueError when *text* is empty or contains anything other
    than hexadecimal digits, so a malformed field is reported instead of
    being silently turned into a wrong code point.
    """
    if not text or not all(ch in string.hexdigits for ch in text):
        raise ValueError("invalid code point: %r" % (text,))
    return int(text, 16)


def parseBlocks(lines):
    """Reduce the lines of the "blocks" file to a dictionary.

    The result is indexed by block name (with spaces removed) and each
    entry is a list of (start, end) tuples, the bounds being kept as
    "0x"-prefixed strings exactly as they will appear in the C code.
    Lines that cannot be parsed are reported on stdout and skipped.
    """
    blockNames = {}
    for line in dataLines(lines):
        try:
            fields = line.split(';')
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (IndexError, ValueError):
            print("Failed to process line: %s" % (line))
            continue
        blockNames.setdefault(name, []).append(("0x" + start, "0x" + end))
    return blockNames


def applyBlockAliases(blockNames, aliases):
    """Add the old block names described by *aliases* to *blockNames*.

    Each alias is "OldName:New1[,New2...]"; the ranges of every known new
    block are appended to the entry for the old name.  Unknown new names
    are reported on stdout.  *blockNames* is modified in place.
    """
    for entry in aliases:
        parts = entry.split(':')
        oldName = parts[0]
        for comp in parts[1].split(','):
            if comp in blockNames:
                blockNames.setdefault(oldName, []).extend(blockNames[comp])
            else:
                print("Alias %s: %s not in Blocks" % (oldName, comp))


def parseCategories(lines):
    """Invert the code-ordered UnicodeData lines into per-category lists.

    Returns (categories, nbchar).  *categories* maps both the full
    category name (third field, e.g. "Lu") and the general category (its
    first character, e.g. "L") to the list of code points in file order;
    *nbchar* is the number of lines successfully processed.  Lines with a
    malformed code point or a missing/empty category are reported on
    stdout and skipped.
    """
    categories = {}
    nbchar = 0
    for line in dataLines(lines):
        try:
            fields = line.split(';')
            value = parseCodePoint(fields[0].strip())
            name = fields[2]
            general = name[0]    # IndexError on an empty category field
        except (IndexError, ValueError):
            print("Failed to process line: %s" % (line))
            continue
        nbchar += 1
        # update entry for "full name", then the "general category" name
        categories.setdefault(name, []).append(value)
        categories.setdefault(general, []).append(value)
    return (categories, nbchar)


def collapseRanges(values):
    """Reduce a list of numbers into a list of (first, last) ranges.

    Consecutive values (each one greater than the previous by exactly 1)
    are merged; the input order is kept and no sorting is done.  An empty
    input gives an empty list.
    """
    ranges = []
    start = prev = None
    for val in values:
        if start is None:
            start = prev = val
        elif val == prev + 1:
            prev = val
        else:
            ranges.append((start, prev))
            start = prev = val
    if start is not None:
        ranges.append((start, prev))
    return ranges


def rangeTest(low, high):
    """Return the C expression testing 'code' against one range.

    Both bounds are inserted into the expression as given.
    """
    if low == high:
        return "(code == %s)" % (low)
    return "((code >= %s) && (code <= %s))" % (low, high)


def writeNameTables(output, bkeys, ckeys):
    """Write the entries of the block name table and the category table.

    The opening line of the block table is part of the source prologue,
    so only its entries and closing brace are written here.
    """
    output.write(",\n".join(
        '  {"%s", xmlUCSIs%s}' % (block, block.replace('-', ''))
        for block in bkeys))
    output.write('};\n\n')

    output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
    output.write(",\n".join(
        '  {"%s", xmlUCSIsCat%s}' % (name, name) for name in ckeys))
    output.write('};\n\n')


def writeRangeTable(output, name, ranges):
    """Write the xmlCharInRange tables for one category.

    Ranges whose upper bound is below 0x10000 go to the "short" table
    xml<name>S, the others to the "long" table xml<name>L; the group
    xml<name>G referencing both is written last.  Source lines are broken
    once they grow past 60 characters.
    """
    numshort = 0
    numlong = 0
    sptr = "NULL"
    lptr = "NULL"
    pline = ""
    for (low, high) in ranges:
        if high < 0x10000:
            if numshort == 0:
                pline = "static const xmlChSRange xml%sS[] = {" % name
                sptr = "xml%sS" % name
            else:
                pline += ", "
            numshort += 1
        else:
            if numlong == 0:
                # first long range: terminate the short table, if any
                if numshort > 0:
                    output.write(pline + " };\n")
                pline = "static const xmlChLRange xml%sL[] = {" % name
                lptr = "xml%sL" % name
            else:
                pline += ", "
            numlong += 1
        if len(pline) > 60:
            output.write(pline + "\n")
            pline = "    "
        pline += "{%s, %s}" % (hex(low), hex(high))
    output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
                 % (name, numshort, numlong, sptr, lptr))


def writeBlockFunctions(header, output, bkeys, blockNames):
    """Write the prototype and definition of xmlUCSIs<Block> per block."""
    for block in bkeys:
        name = block.replace('-', '')
        header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
        output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
        output.write(" *\n * Check whether the character is part of %s UCS Block\n" %
                     (block))
        output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
        output.write("int\nxmlUCSIs%s(int code) {\n    return(" % name)
        output.write(orSeparator.join(
            "((code >= %s) && (code <= %s))" % (start, end)
            for (start, end) in blockNames[block]))
        output.write(");\n}\n\n")


def writeCategoryFunctions(header, output, ckeys, categories):
    """Write the prototype and definition of xmlUCSIsCat<Name> per category.

    Categories with more than minTableSize ranges use their range table,
    the others get inline comparisons.
    """
    for name in ckeys:
        ranges = categories[name]
        header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
        output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
        output.write(" *\n * Check whether the character is part of %s UCS Category\n" %
                     (name))
        output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
        output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
        if len(ranges) > minTableSize:
            output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
                         % name)
        else:
            output.write("    return(" + orSeparator.join(
                rangeTest(hex(begin), hex(end)) for (begin, end) in ranges))
        output.write(");\n}\n\n")


def writeGeneratedFiles(header, output, blockNames, categories):
    """Write the complete header and C source to the two open files."""
    #
    # Assure all data is in alphabetic order, since we will be doing binary
    # searches on the tables.
    #
    bkeys = sorted(blockNames)
    ckeys = sorted(categories)

    date = time.asctime(time.localtime(time.time()))

    header.write(headerPrologue % (webpage, date, sources))
    output.write(sourcePrologue % (webpage, date, sources))

    writeNameTables(output, bkeys, ckeys)

    #
    # For any categories with more than minTableSize ranges we generate
    # a range table suitable for xmlCharInRange
    #
    for name in ckeys:
        if len(categories[name]) > minTableSize:
            writeRangeTable(output, name, categories[name])

    output.write(lookupTemplate % (len(blockNames), len(categories)))

    writeBlockFunctions(header, output, bkeys, blockNames)
    header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
    output.write(isBlockFunction)

    writeCategoryFunctions(header, output, ckeys, categories)
    header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
    output.write(isCatFunctionAndTrailer)

    header.write(headerEpilogue)


def main():
    """Read the UCD files and generate xmlunicode.h / xmlunicode.c."""
    #
    # Process the "blocks" file, reducing it to a dictionary
    # indexed by blockname, containing the applicable block ranges
    #
    with openOrExit(blockfile, "r",
                    "Missing %s, aborting ..." % blockfile) as blocks:
        BlockNames = parseBlocks(blocks)
    print("Parsed %d blocks descriptions" % (len(BlockNames)))

    applyBlockAliases(BlockNames, blockAliases)

    #
    # Next process the Categories file.  The file is in code sequence, so
    # it is inverted into a dictionary indexed by category name.
    #
    with openOrExit(catfile, "r",
                    "Missing %s, aborting ..." % catfile) as data:
        (Categories, nbchar) = parseCategories(data)
    print("Parsed %d char generating %d categories" % (nbchar, len(Categories)))

    #
    # The data is now all read.  Reduce the number lists into ranges.
    #
    for cat in list(Categories):
        Categories[cat] = collapseRanges(Categories[cat])

    #
    # Generate the resulting files
    #
    with openOrExit(headerPath, "w", "Failed to open " + headerPath) as header:
        with openOrExit(sourcePath, "w", "Failed to open " + sourcePath) as output:
            writeGeneratedFiles(header, output, BlockNames, Categories)


if __name__ == "__main__":
    main()