#!/usr/bin/python -u
#
# Portions of this script have been (shamelessly) stolen from the
# prior work of Daniel Veillard (genUnicode.py)
#
# I, however, take full credit for any bugs, errors or difficulties :-)
#
# William Brack
# October 2003
#
# 18 October 2003
# Modified to maintain binary compatibility with previous library versions
# by adding a suffix 'Q' ('quick') to the macro generated for the original
# function, and adding generation of a function (with the original name) which
# instantiates the macro.
#

import sys
import string
import time

sources = "chvalid.def"                 # input filename

# minTableSize gives the minimum number of ranges which must be present
# before a 256-byte lookup table is produced.  If there are less than this
# number, a macro with inline comparisons is generated
minTableSize = 6

# dictionary of functions, key=name, element contains char-map and range-list
Functs = {}

# separator placed between the alternatives of a generated inline test
_OR_CONTINUATION = " || \\\n\t\t\t\t "


class ValidationError(Exception):
    """Raised when a line of the definition file is malformed."""


#
# A routine to take a list of yes/no (1, 0) values and turn it
# into a list of ranges.  This will later be used to determine whether
# to generate single-byte lookup tables, or inline comparisons
#
def makeRange(lst):
    """Return the runs of 1s in lst as a list of (first, last) index tuples.

    Both indexes are inclusive.  An empty list, or one holding no 1,
    gives an empty result.
    """
    ret = []
    start = None                        # index where the open run began
    for pos, flag in enumerate(lst):
        if start is None:
            if flag == 1:
                start = pos
        elif flag == 0:
            ret.append((start, pos - 1))
            start = None
    if start is not None:               # run extends to the end of the list
        ret.append((start, len(lst) - 1))
    return ret


def parseValue(text):
    """Convert one value of a 'ur' line to an integer.

    Accepts hexadecimal ('0x41'), a quoted character ("'A'") or decimal.
    Raises ValidationError for an empty field, ValueError/IndexError for
    other malformed text.
    """
    if not text:
        raise ValidationError("Missing value")
    if text[0:2] == '0x':
        return int(text[2:], 16)
    if text[0] == "'":
        return ord(text[1])
    return int(text)


def splitRange(start, end):
    """Cut (start, end) at the 0x100 and 0x10000 boundaries.

    The generated code looks up values below 0x100 in the single-byte
    map, values below 0x10000 in the 'short' table and the rest in the
    'long' table, so a range crossing one of those limits has to be
    stored as separate pieces or its low part would never be found.
    Returns a list of (first, last) tuples in ascending order.
    """
    pieces = []
    for boundary in (0x100, 0x10000):
        if start < boundary <= end:
            pieces.append((start, boundary - 1))
            start = boundary
    pieces.append((start, end))
    return pieces


#
# The lines in the .def file have three types:-
#   name:   Defines a new function block
#   ur:     Defines individual or ranges of unicode values
#   end:    Indicates the end of the function block
#
def parseDefinitions(lines, functs):
    """Fill functs from the lines of a definition file.

    lines is any iterable of text lines (an open file works).  Each
    function name maps to a two-element list: a 256-entry 0/1 map for
    the single-byte values and a list of (first, last) tuples for the
    values of 0x100 and above.

    A line that cannot be processed is reported on stdout and the
    exception (ValidationError for rule violations) is re-raised.
    """
    state = 0                           # 1 while inside a name..end block
    name = None
    for line in lines:
        # ignore blank lines, or lines beginning with '#'
        if line.startswith('#'):
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            # split on any run of whitespace so that doubled blanks or
            # tabs do not produce empty fields
            fields = line.split()
            #
            # name line:
            #   validate any previous function block already ended
            #   validate this function not already defined
            #   initialize an entry in the function dictionary
            #       including a mask table with no values yet defined
            #
            if fields[0] == 'name':
                name = fields[1]
                if state != 0:
                    print("'name' %s found before previous name "
                          "completed" % (fields[1]))
                    continue
                state = 1
                if name in functs:
                    print("name '%s' already present - may give"
                          " wrong results" % (name))
                else:
                    # dict entry with two list elements (chdata, rangedata)
                    functs[name] = [[0] * 256, []]
            #
            # end line:
            #   validate there was a preceding function name line
            #   set state to show no current function active
            #
            elif fields[0] == 'end':
                if state == 0:
                    print("'end' found outside of function block")
                    continue
                state = 0
            #
            # ur line:
            #   validate function has been defined
            #   process remaining fields on the line, which may be either
            #       individual unicode values or ranges of values
            #
            elif fields[0] == 'ur':
                if state != 1:
                    raise ValidationError(
                        "'ur' found outside of 'name' block")
                for el in fields[1:]:
                    pos = el.find('..')
                    # pos <= 0 means not a range, so must be individual value
                    if pos <= 0:
                        value = parseValue(el)
                        if value < 0 or value > 0x1fffff:
                            raise ValidationError(
                                'Illegal value (%s) in ch for'
                                ' name %s' % (el, name))
                        # we handle only ranges (makes things simpler),
                        # so convert the value to a one-element range
                        currange = (value, value)
                    # pos > 0 means this is a range, so isolate/validate
                    # the interval
                    else:
                        (first, last) = el.split("..")
                        start = parseValue(first)
                        end = parseValue(last)
                        if start < 0 or end > 0x1fffff or start > end:
                            raise ValidationError(
                                "Invalid range '%s'" % el)
                        currange = (start, end)
                    # common path - store each piece either in the
                    # single-byte map or in the multi-byte range list
                    for piece in splitRange(currange[0], currange[1]):
                        if piece[1] < 0x100:        # single-byte
                            for ch in range(piece[0], piece[1] + 1):
                                # validate that value not previously defined
                                if functs[name][0][ch]:
                                    msg = "Duplicate ch value '%s' for" \
                                          " name '%s'" % (el, name)
                                    raise ValidationError(msg)
                                functs[name][0][ch] = 1
                        else:                       # multi-byte
                            if piece in functs[name][1]:
                                raise ValidationError(
                                    "range already defined in function")
                            functs[name][1].append(piece)
        except Exception:
            print("Failed to process line: %s" % (line))
            raise


def padToColumn(pline):
    """Append the tabs needed to move pline to column 33 (none if past it)."""
    ntab = 4 - len(pline) // 8
    if ntab < 0:
        ntab = 0
    return pline + "\t" * ntab


def macroComment(symbol):
    """Return the documentation comment emitted ahead of a generated macro."""
    return """
/**
 * %s:
 * @c: char to validate
 *
 * Automatically generated by genChRanges.py
 */
""" % symbol


def inlineTest(rg, openEnded):
    """Return the C expression testing (c) against the range tuple rg.

    With openEnded set, a range ending in 0xff is emitted without its
    upper bound; that is only valid where (c) is known to be a byte.
    """
    if rg[0] == rg[1]:                  # single value - check equal
        return "((c) == 0x%x)" % rg[0]
    if openEnded and rg[1] == 0xff:
        return " (0x%x <= (c))" % rg[0]
    return "((0x%x <= (c)) && ((c) <= 0x%x))" % (rg[0], rg[1])


def formatRangeArray(ctype, cname, ranges):
    """Return the C source of a static array holding the given ranges."""
    chunks = []
    pline = "static const %s %s[] = { " % (ctype, cname)
    first = True
    for rg in ranges:
        if not first:
            pline += ", "
        first = False
        if len(pline) > 60:             # wrap before the line gets too long
            chunks.append(pline + "\n")
            pline = "    "
        pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
    chunks.append(pline + "};\n")       # finish off the array
    return "".join(chunks)


def writeByteCheck(header, output, f, chdata):
    """Emit the {f}_ch macro (and, if worthwhile, its lookup table)."""
    rangeTable = makeRange(chdata)
    if len(rangeTable) >= minTableSize:         # table is worthwhile
        header.write("XMLPUBVAR const unsigned char %s_tab[256];\n" % f)
        header.write(macroComment("%s_ch" % f))
        header.write("#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f))

        # write the constant data to the code file
        output.write("const unsigned char %s_tab[256] = {\n" % f)
        pline = "   "
        for n in range(255):
            pline += " 0x%02x," % chdata[n]
            if len(pline) > 72:
                output.write(pline + "\n")
                pline = "   "
        output.write(pline + " 0x%02x };\n\n" % chdata[255])
        return

    # inline check is used
    # first another little optimisation - if space is present,
    # put it at the front of the list so it is checked first
    if (0x20, 0x20) in rangeTable:
        rangeTable.remove((0x20, 0x20))
        rangeTable.insert(0, (0x20, 0x20))
    header.write(macroComment("%s_ch" % f))
    tests = [inlineTest(rg, True) for rg in rangeTable]
    header.write(padToColumn("#define %s_ch(c)" % f) + "(" +
                 _OR_CONTINUATION.join(tests) + ")\n")


def writeQuickMacro(header, f, chdata, rangedata):
    """Emit the {f}Q macro combining the byte check and the range check."""
    header.write(macroComment("%sQ" % f))
    header.write(padToColumn("#define %sQ(c)" % f) +
                 "(((c) < 0x100) ? \\\n\t\t\t\t ")
    if max(chdata) > 0:
        header.write("%s_ch((c)) :" % f)
    else:
        header.write("0 :")

    # if no ranges defined, value invalid if >= 0x100
    numRanges = len(rangedata)
    if numRanges == 0:
        header.write(" 0)\n\n")
    elif numRanges >= minTableSize:
        header.write(" \\\n\t\t\t\t xmlCharInRange((c), &%sGroup))\n\n" % f)
    else:                               # if < minTableSize, inline code
        tests = [inlineTest(rg, False) for rg in rangedata]
        header.write("\\\n\t\t\t\t(" + _OR_CONTINUATION.join(tests) +
                     "))\n\n")

    if numRanges > 0:
        header.write("XMLPUBVAR const xmlChRangeGroup %sGroup;\n" % f)


def writeRangeGroup(output, f, rangedata):
    """Emit the short/long range arrays of f and the group referencing them."""
    rangedata.sort()                    # ascending tuple sequence
    shorts = [rg for rg in rangedata if rg[1] < 0x10000]
    longs = [rg for rg in rangedata if rg[1] >= 0x10000]
    if shorts:
        output.write(formatRangeArray("xmlChSRange", "%s_srng" % f, shorts))
    if longs:
        output.write(formatRangeArray("xmlChLRange", "%s_lrng" % f, longs))

    pline = "const xmlChRangeGroup %sGroup =\n\t{%d, %d, " % \
            (f, len(shorts), len(longs))
    if shorts:
        pline += "%s_srng" % f
    else:
        pline += "(xmlChSRangePtr)0"
    if longs:
        pline += ", %s_lrng" % f
    else:
        pline += ", (xmlChLRangePtr)0"
    output.write(pline + "};\n\n")


#
# The output phase generates the two files chvalid.c and chvalid.h
#
# To do this, we first output the 'static' data (heading, fixed
# definitions, etc.), then output the 'dynamic' data (the results
# of the parsing), and finally output closing 'static' data
# (e.g. the subroutine to process the ranges)
#
def generateOutput(functs, header, output, date):
    """Write the complete header and C source for the parsed functions.

    functs is the dictionary built by parseDefinitions; header and
    output are writable text streams receiving chvalid.h and chvalid.c
    respectively; date is the generation date placed in both headings.
    """
    #
    # Generate the headings:
    #
    header.write(
"""/*
 * Summary: Unicode character range checking
 * Description: this module exports interfaces for the character
 *               range validation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
 * Author: William Brack <wbrack (at] mmm.com.hk>
 */

#ifndef __XML_CHVALID_H__
#define __XML_CHVALID_H__

#include <libxml/xmlversion.h>
#include <libxml/xmlstring.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Define our typedefs and structures
 *
 */
typedef struct _xmlChSRange xmlChSRange;
typedef xmlChSRange *xmlChSRangePtr;
struct _xmlChSRange {
    unsigned short low;
    unsigned short high;
};

typedef struct _xmlChLRange xmlChLRange;
typedef xmlChLRange *xmlChLRangePtr;
struct _xmlChLRange {
    unsigned int low;
    unsigned int high;
};

typedef struct _xmlChRangeGroup xmlChRangeGroup;
typedef xmlChRangeGroup *xmlChRangeGroupPtr;
struct _xmlChRangeGroup {
    int nbShortRange;
    int nbLongRange;
    const xmlChSRange *shortRange; /* points to an array of ranges */
    const xmlChLRange *longRange;
};

/**
 * Range checking routine
 */
XMLPUBFUN int XMLCALL
\t\txmlCharInRange(unsigned int val, const xmlChRangeGroup *group);

""" % (date, sources))
    output.write(
"""/*
 * chvalid.c: this module implements the character range
 *            validation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
 * William Brack <wbrack (at] mmm.com.hk>
 */

#define IN_LIBXML
#include "libxml.h"
#include <libxml/chvalid.h>

/*
 * The initial tables ({func_name}_tab) are used to validate whether a
 * single-byte character is within the specified group.  Each table
 * contains 256 bytes, with each byte representing one of the 256
 * possible characters.  If the table byte is set, the character is
 * allowed.
 *
 */
""" % (date, sources))

    #
    # Now output the generated data.
    # We try to produce the best execution times.  Tests have shown that
    # validation with direct table lookup is, when there are a "small"
    # number of valid items, still not as fast as a sequence of inline
    # compares.  So, if the single-byte portion of a range has a "small"
    # number of ranges, we output a macro for inline compares, otherwise
    # we output a 256-byte table and a macro to use it.
    #
    fkeys = sorted(functs)              # put some order to our output

    for f in fkeys:
        chdata, rangedata = functs[f]
        if max(chdata) > 0:             # only check if at least one entry
            writeByteCheck(header, output, f, chdata)
        writeQuickMacro(header, f, chdata, rangedata)

    #
    # Next we do the unicode ranges
    #
    for f in fkeys:
        if len(functs[f][1]) > 0:       # only if unicode ranges present
            writeRangeGroup(output, f, functs[f][1])

    output.write(
"""
/**
 * xmlCharInRange:
 * @val: character to be validated
 * @rptr: pointer to range to be used to validate
 *
 * Does a binary search of the range table to determine if char
 * is valid
 *
 * Returns: true if character valid, false otherwise
 */
int
xmlCharInRange (unsigned int val, const xmlChRangeGroup *rptr) {
    int low, high, mid;
    const xmlChSRange *sptr;
    const xmlChLRange *lptr;

    if (rptr == NULL) return(0);
    if (val < 0x10000) { /* is val in 'short' or 'long' array? */
        if (rptr->nbShortRange == 0)
            return 0;
        low = 0;
        high = rptr->nbShortRange - 1;
        sptr = rptr->shortRange;
        while (low <= high) {
            mid = (low + high) / 2;
            if ((unsigned short) val < sptr[mid].low) {
                high = mid - 1;
            } else {
                if ((unsigned short) val > sptr[mid].high) {
                    low = mid + 1;
                } else {
                    return 1;
                }
            }
        }
    } else {
        if (rptr->nbLongRange == 0) {
            return 0;
        }
        low = 0;
        high = rptr->nbLongRange - 1;
        lptr = rptr->longRange;
        while (low <= high) {
            mid = (low + high) / 2;
            if (val < lptr[mid].low) {
                high = mid - 1;
            } else {
                if (val > lptr[mid].high) {
                    low = mid + 1;
                } else {
                    return 1;
                }
            }
        }
    }
    return 0;
}

""")

    #
    # finally, generate the ABI compatibility functions
    #
    for f in fkeys:
        output.write("""
/**
 * %s:
 * @ch: character to validate
 *
 * This function is DEPRECATED.
""" % f)
        if max(functs[f][0]) > 0:
            output.write(" * Use %s_ch or %sQ instead" % (f, f))
        else:
            output.write(" * Use %sQ instead" % f)
        output.write("""
 *
 * Returns true if argument valid, false otherwise
 */
""")
        output.write("int\n%s(unsigned int ch) {\n    return(%sQ(ch));\n}\n\n"
                     % (f, f))
        header.write("XMLPUBFUN int XMLCALL\n\t\t%s(unsigned int ch);\n" % f)

    #
    # Run complete - write trailers
    #
    header.write("""
#ifdef __cplusplus
}
#endif
#endif /* __XML_CHVALID_H__ */
""")
    output.write("""#define bottom_chvalid
#include "elfgcchack.h"
""")


def main():
    """Read the definition file and regenerate chvalid.h and chvalid.c."""
    try:
        defines = open(sources, "r")
    except IOError:
        print("Missing %s, aborting ..." % sources)
        sys.exit(1)
    try:
        parseDefinitions(defines, Functs)
    finally:
        defines.close()

    try:
        header = open("include/libxml/chvalid.h", "w")
    except IOError:
        print("Failed to open include/libxml/chvalid.h")
        sys.exit(1)
    try:
        try:
            output = open("chvalid.c", "w")
        except IOError:
            print("Failed to open chvalid.c")
            sys.exit(1)
        try:
            date = time.asctime(time.localtime(time.time()))
            generateOutput(Functs, header, output, date)
        finally:
            output.close()
    finally:
        header.close()


if __name__ == "__main__":
    main()