Home | History | Annotate | Download | only in libxml2
      1 #!/usr/bin/python -u
      2 #
      3 # Portions of this script have been (shamelessly) stolen from the
      4 # prior work of Daniel Veillard (genUnicode.py)
      5 #
      6 # I, however, take full credit for any bugs, errors or difficulties :-)
      7 #
      8 # William Brack
      9 # October 2003
     10 #
     11 # 18 October 2003
     12 # Modified to maintain binary compatibility with previous library versions
     13 # by adding a suffix 'Q' ('quick') to the macro generated for the original,
     14 # function, and adding generation of a function (with the original name) which
     15 # instantiates the macro.
     16 #
     17 
     18 import sys
     19 import string
     20 import time
     21 
     22 #
     23 # A routine to take a list of yes/no (1, 0) values and turn it
     24 # into a list of ranges.  This will later be used to determine whether
     25 # to generate single-byte lookup tables, or inline comparisons
     26 #
     27 def makeRange(lst):
     28     ret = []
     29     pos = 0
     30     while pos < len(lst):
     31 	try:		# index generates exception if not present
     32 	    s = lst[pos:].index(1)	# look for start of next range
     33 	except:
     34 	    break			# if no more, finished
     35 	pos += s			# pointer to start of possible range
     36 	try:
     37 	    e = lst[pos:].index(0)	# look for end of range
     38 	    e += pos
     39 	except:				# if no end, set to end of list
     40 	    e = len(lst)
     41 	ret.append((pos, e-1))		# append range tuple to list
     42 	pos = e + 1			# ready to check for next range
     43     return ret
     44 
     45 sources = "chvalid.def"			# input filename
     46 
     47 # minTableSize gives the minimum number of ranges which must be present
     48 # before a 256-byte lookup table is produced.  If there are less than this
     49 # number, a macro with inline comparisons is generated
     50 minTableSize = 6
     51 
     52 # dictionary of functions, key=name, element contains char-map and range-list
     53 Functs = {}
     54 
     55 state = 0
     56 
     57 try:
     58     defines = open("chvalid.def", "r")
     59 except:
     60     print "Missing chvalid.def, aborting ..."
     61     sys.exit(1)
     62 
     63 #
     64 # The lines in the .def file have three types:-
     65 #   name:   Defines a new function block
     66 #   ur:	    Defines individual or ranges of unicode values
     67 #   end:    Indicates the end of the function block
     68 #
     69 # These lines are processed below.
     70 #
     71 for line in defines.readlines():
     72     # ignore blank lines, or lines beginning with '#'
     73     if line[0] == '#':
     74         continue
     75     line = string.strip(line)
     76     if line == '':
     77         continue
     78     # split line into space-separated fields, then split on type
     79     try:
     80         fields = string.split(line, ' ')
     81 	#
     82 	# name line:
     83 	#   validate any previous function block already ended
     84 	#   validate this function not already defined
     85 	#   initialize an entry in the function dicitonary
     86 	#	including a mask table with no values yet defined
     87 	#
     88 	if fields[0] == 'name':
     89 	    name = fields[1]
     90 	    if state != 0:
     91 		print "'name' %s found before previous name" \
     92 		      "completed" % (fields[1])
     93 		continue
     94 	    state = 1
     95 	    if Functs.has_key(name):
     96 		print "name '%s' already present - may give" \
     97 		      " wrong results" % (name)
     98 	    else:
     99 		# dict entry with two list elements (chdata, rangedata)
    100 		Functs[name] = [ [], [] ]
    101 		for v in range(256):
    102 		    Functs[name][0].append(0)
    103 	#
    104 	# end line:
    105 	#   validate there was a preceding function name line
    106 	#   set state to show no current function active
    107 	#
    108 	elif fields[0] == 'end':
    109 	    if state == 0:
    110 		print "'end' found outside of function block"
    111 		continue
    112 	    state = 0
    113 
    114 	#
    115 	# ur line:
    116 	#   validate function has been defined
    117 	#   process remaining fields on the line, which may be either
    118 	#	individual unicode values or ranges of values
    119 	#
    120 	elif fields[0] == 'ur':
    121 	    if state != 1:
    122 		raise ValidationError, "'ur' found outside of 'name' block"
    123 	    for el in fields[1:]:
    124 		pos = string.find(el, '..')
    125 		# pos <=0 means not a range, so must be individual value
    126 		if pos <= 0:
    127 		    # cheap handling of hex or decimal values
    128 		    if el[0:2] == '0x':
    129 		        value = int(el[2:],16)
    130 		    elif el[0] == "'":
    131 			value = ord(el[1])
    132 		    else:
    133 			value = int(el)
    134 		    if ((value < 0) | (value > 0x1fffff)):
    135 			raise ValidationError, 'Illegal value (%s) in ch for'\
    136 				' name %s' % (el,name)
    137 		    # for ur we have only ranges (makes things simpler),
    138 		    # so convert val to range
    139 		    currange = (value, value)
    140 		# pos > 0 means this is a range, so isolate/validate
    141 		# the interval
    142 		else:
    143 		    # split the range into it's first-val, last-val
    144 		    (first, last) = string.split(el, "..")
    145 		    # convert values from text into binary
    146 		    if first[0:2] == '0x':	
    147 			start = int(first[2:],16)
    148 		    elif first[0] == "'":
    149 			start = ord(first[1])
    150 		    else:
    151 			start = int(first)
    152 		    if last[0:2] == '0x':
    153 			end = int(last[2:],16)
    154 		    elif last[0] == "'":
    155 			end = ord(last[1])
    156 		    else:
    157 			end = int(last)
    158 		    if (start < 0) | (end > 0x1fffff) | (start > end):
    159 			raise ValidationError, "Invalid range '%s'" % el
    160 		    currange = (start, end)
    161 		# common path - 'currange' has the range, now take care of it
    162 		# We split on single-byte values vs. multibyte
    163 		if currange[1] < 0x100:	# single-byte
    164 		    for ch in range(currange[0],currange[1]+1):
    165 			# validate that value not previously defined
    166 			if Functs[name][0][ch]:
    167 			    msg = "Duplicate ch value '%s' for name '%s'" % (el, name)
    168 			    raise ValidationError, msg
    169 			Functs[name][0][ch] = 1
    170 		else:			# multi-byte
    171 		    if currange in Functs[name][1]:
    172 			raise ValidationError, "range already defined in" \
    173 				" function"
    174 		    else:
    175 			Functs[name][1].append(currange)
    176 
    177     except:
    178 	print "Failed to process line: %s" % (line)
    179 	raise
    180 #
    181 # At this point, the entire definition file has been processed.  Now we
# enter the output phase, where we generate the two files chvalid.c and
    183 # chvalid.h
    184 #
    185 # To do this, we first output the 'static' data (heading, fixed
    186 # definitions, etc.), then output the 'dynamic' data (the results
    187 # of the above processing), and finally output closing 'static' data
    188 # (e.g. the subroutine to process the ranges)
    189 #
    190 
    191 #
    192 # Generate the headings:
    193 #
    194 try:
    195     header = open("include/libxml/chvalid.h", "w")
    196 except:
    197     print "Failed to open include/libxml/chvalid.h"
    198     sys.exit(1)
    199 
    200 try:
    201     output = open("chvalid.c", "w")
    202 except:
    203     print "Failed to open chvalid.c"
    204     sys.exit(1)
    205 
    206 date = time.asctime(time.localtime(time.time()))
    207 
    208 header.write(
    209 """/*
    210  * Summary: Unicode character range checking
    211  * Description: this module exports interfaces for the character
    212  *               range validation APIs
    213  *
    214  * This file is automatically generated from the cvs source
    215  * definition files using the genChRanges.py Python script
    216  *
    217  * Generation date: %s
    218  * Sources: %s
    219  * Author: William Brack <wbrack (at] mmm.com.hk>
    220  */
    221 
    222 #ifndef __XML_CHVALID_H__
    223 #define __XML_CHVALID_H__
    224 
    225 #include <libxml/xmlversion.h>
    226 #include <libxml/xmlstring.h>
    227 
    228 #ifdef __cplusplus
    229 extern "C" {
    230 #endif
    231 
    232 /*
    233  * Define our typedefs and structures
    234  *
    235  */
    236 typedef struct _xmlChSRange xmlChSRange;
    237 typedef xmlChSRange *xmlChSRangePtr;
    238 struct _xmlChSRange {
    239     unsigned short	low;
    240     unsigned short	high;
    241 };
    242 
    243 typedef struct _xmlChLRange xmlChLRange;
    244 typedef xmlChLRange *xmlChLRangePtr;
    245 struct _xmlChLRange {
    246     unsigned int	low;
    247     unsigned int	high;
    248 };
    249 
    250 typedef struct _xmlChRangeGroup xmlChRangeGroup;
    251 typedef xmlChRangeGroup *xmlChRangeGroupPtr;
    252 struct _xmlChRangeGroup {
    253     int			nbShortRange;
    254     int			nbLongRange;
    255     const xmlChSRange	*shortRange;	/* points to an array of ranges */
    256     const xmlChLRange	*longRange;
    257 };
    258 
    259 /**
    260  * Range checking routine
    261  */
    262 XMLPUBFUN int XMLCALL
    263 		xmlCharInRange(unsigned int val, const xmlChRangeGroup *group);
    264 
    265 """ % (date, sources));
    266 output.write(
    267 """/*
    268  * chvalid.c:	this module implements the character range
    269  *		validation APIs
    270  *
    271  * This file is automatically generated from the cvs source
    272  * definition files using the genChRanges.py Python script
    273  *
    274  * Generation date: %s
    275  * Sources: %s
    276  * William Brack <wbrack (at] mmm.com.hk>
    277  */
    278 
    279 #define IN_LIBXML
    280 #include "libxml.h"
    281 #include <libxml/chvalid.h>
    282 
    283 /*
    284  * The initial tables ({func_name}_tab) are used to validate whether a
    285  * single-byte character is within the specified group.  Each table
    286  * contains 256 bytes, with each byte representing one of the 256
    287  * possible characters.  If the table byte is set, the character is
    288  * allowed.
    289  *
    290  */
    291 """ % (date, sources));
    292 
    293 #
    294 # Now output the generated data.
    295 # We try to produce the best execution times.  Tests have shown that validation
    296 # with direct table lookup is, when there are a "small" number of valid items,
    297 # still not as fast as a sequence of inline compares.  So, if the single-byte
    298 # portion of a range has a "small" number of ranges, we output a macro for inline
    299 # compares, otherwise we output a 256-byte table and a macro to use it.
    300 #
    301 
    302 fkeys = Functs.keys()	# Dictionary of all defined functions
    303 fkeys.sort()		# Put some order to our output
    304 
    305 for f in fkeys:
    306 
    307 # First we convert the specified single-byte values into a group of ranges.
    308 # If the total number of such ranges is less than minTableSize, we generate
    309 # an inline macro for direct comparisons; if greater, we generate a lookup
    310 # table.
    311     if max(Functs[f][0]) > 0:	# only check if at least one entry
    312         rangeTable = makeRange(Functs[f][0])
    313 	numRanges = len(rangeTable)
    314 	if numRanges >= minTableSize:	# table is worthwhile
    315 	    header.write("XMLPUBVAR const unsigned char %s_tab[256];\n" % f)
    316 	    header.write("""
    317 /**
    318  * %s_ch:
    319  * @c: char to validate
    320  *
    321  * Automatically generated by genChRanges.py
    322  */
    323 """ % f)
    324 	    header.write("#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f))
    325 
    326 	    # write the constant data to the code file
    327 	    output.write("const unsigned char %s_tab[256] = {\n" % f)
    328 	    pline = "   "
    329 	    for n in range(255):
    330 		pline += " 0x%02x," % Functs[f][0][n]
    331 		if len(pline) > 72:
    332 		    output.write(pline + "\n")
    333 		    pline = "   "
    334 	    output.write(pline + " 0x%02x };\n\n" % Functs[f][0][255])
    335 
    336 	else:		# inline check is used
    337 	    # first another little optimisation - if space is present,
    338 	    # put it at the front of the list so it is checked first
    339 	    try:
    340 		ix = rangeTable.remove((0x20, 0x20))
    341 		rangeTable.insert(0, (0x20, 0x20))
    342 	    except:
    343 		pass
    344 	    firstFlag = 1
    345 	    
    346 	    header.write("""
    347 /**
    348  * %s_ch:
    349  * @c: char to validate
    350  *
    351  * Automatically generated by genChRanges.py
    352  */
    353 """ % f)
    354 	    # okay, I'm tired of the messy lineup - let's automate it!
    355 	    pline = "#define %s_ch(c)" % f
    356 	    # 'ntab' is number of tabs needed to position to col. 33 from name end
    357 	    ntab = 4 - (len(pline)) / 8
    358 	    if ntab < 0:
    359 		ntab = 0
    360 	    just = ""
    361 	    for i in range(ntab):
    362 		just += "\t"
    363 	    pline = pline + just + "("
    364 	    for rg in rangeTable:
    365 		if not firstFlag:
    366 		    pline += " || \\\n\t\t\t\t "
    367 		else:
    368 		    firstFlag = 0
    369 		if rg[0] == rg[1]:		# single value - check equal
    370 		    pline += "((c) == 0x%x)" % rg[0]
    371 		else:				# value range
    372 		# since we are doing char, also change range ending in 0xff
    373 		    if rg[1] != 0xff:
    374 		        pline += "((0x%x <= (c)) &&" % rg[0]
    375 		        pline += " ((c) <= 0x%x))" % rg[1]
    376 		    else:
    377 		        pline += " (0x%x <= (c))" % rg[0]
    378 	    pline += ")\n"
    379 	    header.write(pline)
    380 
    381     header.write("""
    382 /**
    383  * %sQ:
    384  * @c: char to validate
    385  *
    386  * Automatically generated by genChRanges.py
    387  */
    388 """ % f)
    389     pline = "#define %sQ(c)" % f
    390     ntab = 4 - (len(pline)) / 8
    391     if ntab < 0:
    392 	ntab = 0
    393     just = ""
    394     for i in range(ntab):
    395 	just += "\t"
    396     header.write(pline + just + "(((c) < 0x100) ? \\\n\t\t\t\t ")
    397     if max(Functs[f][0]) > 0:
    398 	header.write("%s_ch((c)) :" % f)
    399     else:
    400 	header.write("0 :")
    401 
    402     # if no ranges defined, value invalid if >= 0x100
    403     numRanges = len(Functs[f][1])
    404     if numRanges == 0:
    405 	header.write(" 0)\n\n")
    406     else:
    407 	if numRanges >= minTableSize:
    408 	    header.write(" \\\n\t\t\t\t xmlCharInRange((c), &%sGroup))\n\n"  % f)
    409 	else:		# if < minTableSize, generate inline code
    410 	    firstFlag = 1
    411 	    for rg in Functs[f][1]:
    412 		if not firstFlag:
    413 		    pline += " || \\\n\t\t\t\t "
    414 		else:
    415 		    firstFlag = 0
    416 		    pline = "\\\n\t\t\t\t("
    417 		if rg[0] == rg[1]:		# single value - check equal
    418 		    pline += "((c) == 0x%x)" % rg[0]
    419 		else:				# value range
    420 		    pline += "((0x%x <= (c)) &&" % rg[0]
    421 		    pline += " ((c) <= 0x%x))" % rg[1]
    422 	    pline += "))\n\n"
    423 	    header.write(pline)
    424 
    425 
    426     if len(Functs[f][1]) > 0:
    427 	header.write("XMLPUBVAR const xmlChRangeGroup %sGroup;\n" % f)
    428 
    429 
    430 #
    431 # Next we do the unicode ranges
    432 #
    433 
    434 for f in fkeys:
    435     if len(Functs[f][1]) > 0:	# only generate if unicode ranges present
    436 	rangeTable = Functs[f][1]
    437 	rangeTable.sort()	# ascending tuple sequence
    438 	numShort = 0
    439 	numLong  = 0
    440 	for rg in rangeTable:
    441 	    if rg[1] < 0x10000:	# if short value
    442 		if numShort == 0:	# first occurence
    443 		    pline = "static const xmlChSRange %s_srng[] = { " % f
    444 		else:
    445 		    pline += ", "
    446 		numShort += 1
    447 		if len(pline) > 60:
    448 		    output.write(pline + "\n")
    449 		    pline = "    "
    450 		pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
    451 	    else:		# if long value
    452 		if numLong == 0:	# first occurence
    453 		    if numShort > 0:	# if there were shorts, finish them off
    454 			output.write(pline + "};\n")
    455 		    pline = "static const xmlChLRange %s_lrng[] = { " % f
    456 		else:
    457 		    pline += ", "
    458 		numLong += 1
    459 		if len(pline) > 60:
    460 		    output.write(pline + "\n")
    461 		    pline = "    "
    462 		pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
    463 	output.write(pline + "};\n")	# finish off last group
    464 
    465 	pline = "const xmlChRangeGroup %sGroup =\n\t{%d, %d, " % (f, numShort, numLong)
    466 	if numShort > 0:
    467 	    pline += "%s_srng" % f
    468 	else:
    469 	    pline += "(xmlChSRangePtr)0"
    470 	if numLong > 0:
    471 	    pline += ", %s_lrng" % f
    472 	else:
    473 	    pline += ", (xmlChLRangePtr)0"
    474 	
    475 	output.write(pline + "};\n\n")
    476 
    477 output.write(
    478 """
    479 /**
    480  * xmlCharInRange:
    481  * @val: character to be validated
    482  * @rptr: pointer to range to be used to validate
    483  *
    484  * Does a binary search of the range table to determine if char
    485  * is valid
    486  *
    487  * Returns: true if character valid, false otherwise
    488  */
    489 int
    490 xmlCharInRange (unsigned int val, const xmlChRangeGroup *rptr) {
    491     int low, high, mid;
    492     const xmlChSRange *sptr;
    493     const xmlChLRange *lptr;
    494 
    495     if (rptr == NULL) return(0);
    496     if (val < 0x10000) {	/* is val in 'short' or 'long'  array? */
    497 	if (rptr->nbShortRange == 0)
    498 	    return 0;
    499 	low = 0;
    500 	high = rptr->nbShortRange - 1;
    501 	sptr = rptr->shortRange;
    502 	while (low <= high) {
    503 	    mid = (low + high) / 2;
    504 	    if ((unsigned short) val < sptr[mid].low) {
    505 		high = mid - 1;
    506 	    } else {
    507 	        if ((unsigned short) val > sptr[mid].high) {
    508 		    low = mid + 1;
    509 		} else {
    510 		    return 1;
    511 		}
    512 	    }
    513 	}
    514     } else {
    515 	if (rptr->nbLongRange == 0) {
    516 	    return 0;
    517 	}
    518 	low = 0;
    519 	high = rptr->nbLongRange - 1;
    520 	lptr = rptr->longRange;
    521 	while (low <= high) {
    522 	    mid = (low + high) / 2;
    523 	    if (val < lptr[mid].low) {
    524 		high = mid - 1;
    525 	    } else {
    526 	        if (val > lptr[mid].high) {
    527 		    low = mid + 1;
    528 		} else {
    529 		    return 1;
    530 		}
    531 	    }
    532 	}
    533     }
    534     return 0;
    535 }
    536 
    537 """);
    538 
    539 #
    540 # finally, generate the ABI compatibility functions
    541 #
    542 for f in fkeys:
    543     output.write("""
    544 /**
    545  * %s:
    546  * @ch:  character to validate
    547  *
    548  * This function is DEPRECATED.
    549 """ % f);
    550     if max(Functs[f][0]) > 0:
    551         output.write(" * Use %s_ch or %sQ instead" % (f, f))
    552     else:
    553         output.write(" * Use %sQ instead" % f)
    554     output.write("""
    555  *
    556  * Returns true if argument valid, false otherwise
    557  */
    558 """)
    559     output.write("int\n%s(unsigned int ch) {\n    return(%sQ(ch));\n}\n\n" % (f,f))
    560     header.write("XMLPUBFUN int XMLCALL\n\t\t%s(unsigned int ch);\n" % f);
    561 #
    562 # Run complete - write trailers and close the output files
    563 #
    564 
    565 header.write("""
    566 #ifdef __cplusplus
    567 }
    568 #endif
    569 #endif /* __XML_CHVALID_H__ */
    570 """)
    571 
    572 header.close()
    573 
    574 output.write("""#define bottom_chvalid
    575 #include "elfgcchack.h"
    576 """)
    577 output.close()
    578 
    579