Home | History | Annotate | Download | only in libxml2
      1 #!/usr/bin/python -u
      2 #
      3 # Original script modified in November 2003 to take advantage of
      4 # the character-validation range routines, and updated to the
      5 # current Unicode information (Version 4.0.1)
      6 #
      7 # NOTE: there is an 'alias' facility for blocks which are not present in
      8 #	the current release, but are needed for ABI compatibility.  This
      9 #	must be accomplished MANUALLY!  Please see the comments below under
     10 #     'blockAliases'
     11 #
     12 import sys
     13 import string
     14 import time
     15 
     16 webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
     17 sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"
     18 
     19 #
     20 # blockAliases is a small hack - it is used for mapping block names which
     21 # were were used in the 3.1 release, but are missing or changed in the current
     22 # release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
     23 blockAliases = []
     24 blockAliases.append("CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols")
     25 blockAliases.append("Greek:GreekandCoptic")
     26 blockAliases.append("PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," + 
     27 	"SupplementaryPrivateUseArea-B")
     28 
     29 # minTableSize gives the minimum number of ranges which must be present
     30 # before a range table is produced.  If there are less than this
     31 # number, inline comparisons are generated
     32 minTableSize = 8
     33 
     34 (blockfile, catfile) = string.split(sources)
     35 
     36 
     37 #
     38 # Now process the "blocks" file, reducing it to a dictionary
     39 # indexed by blockname, containing a tuple with the applicable
     40 # block range
     41 #
     42 BlockNames = {}
     43 try:
     44     blocks = open(blockfile, "r")
     45 except:
     46     print "Missing %s, aborting ..." % blockfile
     47     sys.exit(1)
     48 
     49 for line in blocks.readlines():
     50     if line[0] == '#':
     51         continue
     52     line = string.strip(line)
     53     if line == '':
     54         continue
     55     try:
     56         fields = string.split(line, ';')
     57         range = string.strip(fields[0])
     58         (start, end) = string.split(range, "..")
     59         name = string.strip(fields[1])
     60         name = string.replace(name, ' ', '')
     61     except:
     62         print "Failed to process line: %s" % (line)
     63         continue
     64     start = "0x" + start
     65     end = "0x" + end
     66     try:
     67         BlockNames[name].append((start, end))
     68     except:
     69         BlockNames[name] = [(start, end)]
     70 blocks.close()
     71 print "Parsed %d blocks descriptions" % (len(BlockNames.keys()))
     72 
     73 for block in blockAliases:
     74     alias = string.split(block,':')
     75     alist = string.split(alias[1],',')
     76     for comp in alist:
     77         if BlockNames.has_key(comp):
     78             if alias[0] not in BlockNames:
     79                 BlockNames[alias[0]] = []
     80             for r in BlockNames[comp]:
     81                 BlockNames[alias[0]].append(r)
     82         else:
     83             print "Alias %s: %s not in Blocks" % (alias[0], comp)
     84             continue
     85 
     86 #
     87 # Next process the Categories file. This is more complex, since
     88 # the file is in code sequence, and we need to invert it.  We use
     89 # a dictionary with index category-name, with each entry containing
     90 # all the ranges (codepoints) of that category.  Note that category
     91 # names comprise two parts - the general category, and the "subclass"
     92 # within that category.  Therefore, both "general category" (which is
     93 # the first character of the 2-character category-name) and the full
     94 # (2-character) name are entered into this dictionary.
     95 #
     96 try:
     97     data = open(catfile, "r")
     98 except:
     99     print "Missing %s, aborting ..." % catfile
    100     sys.exit(1)
    101 
    102 nbchar = 0;
    103 Categories = {}
    104 for line in data.readlines():
    105     if line[0] == '#':
    106         continue
    107     line = string.strip(line)
    108     if line == '':
    109         continue
    110     try:
    111         fields = string.split(line, ';')
    112         point = string.strip(fields[0])
    113         value = 0
    114         while point != '':
    115             value = value * 16
    116             if point[0] >= '0' and point[0] <= '9':
    117                 value = value + ord(point[0]) - ord('0')
    118             elif point[0] >= 'A' and point[0] <= 'F':
    119                 value = value + 10 + ord(point[0]) - ord('A')
    120             elif point[0] >= 'a' and point[0] <= 'f':
    121                 value = value + 10 + ord(point[0]) - ord('a')
    122             point = point[1:]
    123         name = fields[2]
    124     except:
    125         print "Failed to process line: %s" % (line)
    126         continue
    127     
    128     nbchar = nbchar + 1
    129     # update entry for "full name"
    130     try:
    131         Categories[name].append(value)
    132     except:
    133         try:
    134             Categories[name] = [value]
    135         except:
    136             print "Failed to process line: %s" % (line)
    137     # update "general category" name
    138     try:
    139         Categories[name[0]].append(value)
    140     except:
    141         try:
    142             Categories[name[0]] = [value]
    143         except:
    144             print "Failed to process line: %s" % (line)
    145 
    146 blocks.close()
    147 print "Parsed %d char generating %d categories" % (nbchar, len(Categories.keys()))
    148 
    149 #
    150 # The data is now all read.  Time to process it into a more useful form.
    151 #
    152 # reduce the number list into ranges
    153 for cat in Categories.keys():
    154     list = Categories[cat]
    155     start = -1
    156     prev = -1
    157     end = -1
    158     ranges = []
    159     for val in list:
    160         if start == -1:
    161             start = val
    162             prev = val
    163             continue
    164         elif val == prev + 1:
    165             prev = val
    166             continue
    167         elif prev == start:
    168             ranges.append((prev, prev))
    169             start = val
    170             prev = val
    171             continue
    172         else:
    173             ranges.append((start, prev))
    174             start = val
    175             prev = val
    176             continue
    177     if prev == start:
    178         ranges.append((prev, prev))
    179     else:
    180         ranges.append((start, prev))
    181     Categories[cat] = ranges
    182 
    183 #
    184 # Assure all data is in alphabetic order, since we will be doing binary
    185 # searches on the tables.
    186 #
    187 bkeys = BlockNames.keys()
    188 bkeys.sort()
    189 
    190 ckeys = Categories.keys()
    191 ckeys.sort()
    192 
    193 #
    194 # Generate the resulting files
    195 #
    196 try:
    197     header = open("include/libxml/xmlunicode.h", "w")
    198 except:
    199     print "Failed to open include/libxml/xmlunicode.h"
    200     sys.exit(1)
    201 
    202 try:
    203     output = open("xmlunicode.c", "w")
    204 except:
    205     print "Failed to open xmlunicode.c"
    206     sys.exit(1)
    207 
    208 date = time.asctime(time.localtime(time.time()))
    209 
    210 header.write(
    211 """/*
    212  * Summary: Unicode character APIs
    213  * Description: API for the Unicode character APIs
    214  *
    215  * This file is automatically generated from the
    216  * UCS description files of the Unicode Character Database
    217  * %s
    218  * using the genUnicode.py Python script.
    219  *
    220  * Generation date: %s
    221  * Sources: %s
    222  * Author: Daniel Veillard
    223  */
    224 
    225 #ifndef __XML_UNICODE_H__
    226 #define __XML_UNICODE_H__
    227 
    228 #include <libxml/xmlversion.h>
    229 
    230 #ifdef LIBXML_UNICODE_ENABLED
    231 
    232 #ifdef __cplusplus
    233 extern "C" {
    234 #endif
    235 
    236 """ % (webpage, date, sources));
    237 
    238 output.write(
    239 """/*
    240  * xmlunicode.c: this module implements the Unicode character APIs
    241  *
    242  * This file is automatically generated from the
    243  * UCS description files of the Unicode Character Database
    244  * %s
    245  * using the genUnicode.py Python script.
    246  *
    247  * Generation date: %s
    248  * Sources: %s
    249  * Daniel Veillard <veillard (at] redhat.com>
    250  */
    251 
    252 #define IN_LIBXML
    253 #include "libxml.h"
    254 
    255 #ifdef LIBXML_UNICODE_ENABLED
    256 
    257 #include <string.h>
    258 #include <libxml/xmlversion.h>
    259 #include <libxml/xmlunicode.h>
    260 #include <libxml/chvalid.h>
    261 
    262 typedef int (xmlIntFunc)(int);	/* just to keep one's mind untwisted */
    263 
    264 typedef struct {
    265     const char *rangename;
    266     xmlIntFunc *func;
    267 } xmlUnicodeRange;
    268 
    269 typedef struct {
    270     xmlUnicodeRange *table;
    271     int		    numentries;
    272 } xmlUnicodeNameTable;
    273 
    274 
    275 static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);
    276 
    277 static xmlUnicodeRange xmlUnicodeBlocks[] = {
    278 """ % (webpage, date, sources));
    279 
    280 flag = 0
    281 for block in bkeys:
    282     name = string.replace(block, '-', '')
    283     if flag:
    284         output.write(',\n')
    285     else:
    286         flag = 1
    287     output.write('  {"%s", xmlUCSIs%s}' % (block, name))
    288 output.write('};\n\n')
    289 
    290 output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
    291 flag = 0;
    292 for name in ckeys:
    293     if flag:
    294         output.write(',\n')
    295     else:
    296         flag = 1
    297     output.write('  {"%s", xmlUCSIsCat%s}' % (name, name))
    298 output.write('};\n\n')
    299 
    300 #
    301 # For any categories with more than minTableSize ranges we generate
    302 # a range table suitable for xmlCharInRange
    303 #
    304 for name in ckeys:
    305   if len(Categories[name]) > minTableSize:
    306     numshort = 0
    307     numlong = 0
    308     ranges = Categories[name]
    309     sptr = "NULL"
    310     lptr = "NULL"
    311     for range in ranges:
    312       (low, high) = range
    313       if high < 0x10000:
    314         if numshort == 0:
    315           pline = "static const xmlChSRange xml%sS[] = {" % name
    316           sptr = "xml%sS" % name
    317         else:
    318           pline += ", "
    319         numshort += 1
    320       else:
    321         if numlong == 0:
    322           if numshort > 0:
    323             output.write(pline + " };\n")
    324           pline = "static const xmlChLRange xml%sL[] = {" % name
    325           lptr = "xml%sL" % name
    326         else:
    327           pline += ", "
    328         numlong += 1
    329       if len(pline) > 60:
    330         output.write(pline + "\n")
    331         pline = "    "
    332       pline += "{%s, %s}" % (hex(low), hex(high))
    333     output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
    334          % (name, numshort, numlong, sptr, lptr))
    335 
    336 
    337 output.write(
    338 """static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
    339 static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};
    340 
    341 /**
    342  * xmlUnicodeLookup:
    343  * @tptr: pointer to the name table
    344  * @name: name to be found
    345  *
    346  * binary table lookup for user-supplied name
    347  *
    348  * Returns pointer to range function if found, otherwise NULL
    349  */
    350 static xmlIntFunc
    351 *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
    352     int low, high, mid, cmp;
    353     xmlUnicodeRange *sptr;
    354 
    355     if ((tptr == NULL) || (tname == NULL)) return(NULL);
    356 
    357     low = 0;
    358     high = tptr->numentries - 1;
    359     sptr = tptr->table;
    360     while (low <= high) {
    361 	mid = (low + high) / 2;
    362 	if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
    363 	    return (sptr[mid].func);
    364 	if (cmp < 0)
    365 	    high = mid - 1;
    366 	else
    367 	    low = mid + 1;
    368     }
    369     return (NULL);    
    370 }
    371 
    372 """ % (len(BlockNames), len(Categories)) )
    373 
    374 for block in bkeys:
    375     name = string.replace(block, '-', '')
    376     header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
    377     output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
    378     output.write(" *\n * Check whether the character is part of %s UCS Block\n"%
    379                  (block))
    380     output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
    381     output.write("int\nxmlUCSIs%s(int code) {\n    return(" % name)
    382     flag = 0
    383     for (start, end) in BlockNames[block]:
    384         if flag:
    385             output.write(" ||\n           ")
    386         else:
    387             flag = 1
    388         output.write("((code >= %s) && (code <= %s))" % (start, end))
    389     output.write(");\n}\n\n")
    390 
    391 header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
    392 output.write(
    393 """/**
    394  * xmlUCSIsBlock:
    395  * @code: UCS code point
    396  * @block: UCS block name
    397  *
    398  * Check whether the character is part of the UCS Block
    399  *
    400  * Returns 1 if true, 0 if false and -1 on unknown block
    401  */
    402 int
    403 xmlUCSIsBlock(int code, const char *block) {
    404     xmlIntFunc *func;
    405 
    406     func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
    407     if (func == NULL)
    408 	return (-1);
    409     return (func(code));
    410 }
    411 
    412 """)
    413 
    414 for name in ckeys:
    415     ranges = Categories[name]
    416     header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
    417     output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
    418     output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
    419                  (name))
    420     output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
    421     output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
    422     if len(Categories[name]) > minTableSize:
    423         output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
    424             % name)
    425     else:
    426         start = 1
    427         for range in ranges:
    428             (begin, end) = range;
    429             if start:
    430                 output.write("    return(");
    431                 start = 0
    432             else:
    433                 output.write(" ||\n           ");
    434             if (begin == end):
    435                 output.write("(code == %s)" % (hex(begin)))
    436             else:
    437                 output.write("((code >= %s) && (code <= %s))" % (
    438                          hex(begin), hex(end)))
    439     output.write(");\n}\n\n")
    440 
    441 header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
    442 output.write(
    443 """/**
    444  * xmlUCSIsCat:
    445  * @code: UCS code point
    446  * @cat: UCS Category name
    447  *
    448  * Check whether the character is part of the UCS Category
    449  *
    450  * Returns 1 if true, 0 if false and -1 on unknown category
    451  */
    452 int
    453 xmlUCSIsCat(int code, const char *cat) {
    454     xmlIntFunc *func;
    455 
    456     func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
    457     if (func == NULL)
    458 	return (-1);
    459     return (func(code));
    460 }
    461 
    462 #define bottom_xmlunicode
    463 #include "elfgcchack.h"
    464 #endif /* LIBXML_UNICODE_ENABLED */
    465 """)
    466 
    467 header.write("""
    468 #ifdef __cplusplus
    469 }
    470 #endif
    471 
    472 #endif /* LIBXML_UNICODE_ENABLED */
    473 
    474 #endif /* __XML_UNICODE_H__ */
    475 """);
    476 
    477 header.close()
    478 output.close()
    479