Home | History | Annotate | Download | only in doc
      1 #!/usr/bin/python -u
      2 #
      3 # imports the API description and fills up a database with
      4 # name relevance to modules, functions or web pages
      5 #
      6 # Operation needed:
      7 # =================
      8 #
      9 # install mysqld, the python wrappers for mysql and libxml2, start mysqld
     10 # Change the root passwd of mysql:
     11 #    mysqladmin -u root password new_password
     12 # Create the new database xmlsoft
     13 #    mysqladmin -p create xmlsoft
# Create a database user 'veillard' and give him password access
     15 # change veillard and abcde with the right user name and passwd
     16 #    mysql -p
     17 #    password:
     18 #    mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
     19 #           IDENTIFIED BY 'abcde' WITH GRANT OPTION;
     20 #
     21 # As the user check the access:
     22 #    mysql -p xmlsoft
     23 #    Enter password:
     24 #    Welcome to the MySQL monitor....
     25 #    mysql> use xmlsoft
     26 #    Database changed
     27 #    mysql> quit
     28 #    Bye
     29 #
# Then run the script in the doc subdir, it will create the symbols and
# word tables and populate them with information extracted from
# the libxml2-api.xml API description, and make them accessible read-only
# by nobody@localhost, the user expected to be Apache's one
     34 #
     35 # On the Apache configuration, make sure you have php support enabled
     36 #
     37 
     38 import MySQLdb
     39 import libxml2
     40 import sys
     41 import string
     42 import os
     43 
     44 #
     45 # We are not interested in parsing errors here
     46 #
def callback(ctx, str):
    """libxml2 error handler that discards the message it is given."""
    return
# Register the no-op handler above as the libxml2 error handler.
libxml2.registerErrorHandler(callback, None)
     50 
     51 #
     52 # The dictionary of tables required and the SQL command needed
     53 # to create them
     54 #
TABLES={
  # symbols: one row per symbol name, with its module, type and description
  "symbols" : """CREATE TABLE symbols (
           name varchar(255) BINARY NOT NULL,
	   module varchar(255) BINARY NOT NULL,
           type varchar(25) NOT NULL,
	   descr varchar(255),
	   UNIQUE KEY name (name),
	   KEY module (module))""",
  # words: relevance of a word for a symbol, unique per (name, symbol)
  "words" : """CREATE TABLE words (
           name varchar(50) BINARY NOT NULL,
	   symbol varchar(255) BINARY NOT NULL,
           relevance int,
	   KEY name (name),
	   KEY symbol (symbol),
	   UNIQUE KEY ID (name, symbol))""",
  # wordsHTML: relevance of a word for a resource, unique per (name, resource)
  "wordsHTML" : """CREATE TABLE wordsHTML (
           name varchar(50) BINARY NOT NULL,
	   resource varchar(255) BINARY NOT NULL,
	   section varchar(255),
	   id varchar(50),
           relevance int,
	   KEY name (name),
	   KEY resource (resource),
	   UNIQUE KEY ref (name, resource))""",
  # wordsArchive: relevance of a word for an archive ID, unique per (name, ID)
  "wordsArchive" : """CREATE TABLE wordsArchive (
           name varchar(50) BINARY NOT NULL,
	   ID int(11) NOT NULL,
           relevance int,
	   KEY name (name),
	   UNIQUE KEY ref (name, ID))""",
  # pages: title of each resource, unique per resource
  "pages" : """CREATE TABLE pages (
           resource varchar(255) BINARY NOT NULL,
	   title varchar(255) BINARY NOT NULL,
	   UNIQUE KEY name (resource))""",
  # archives: auto-numbered resources with their title
  "archives" : """CREATE TABLE archives (
           ID int(11) NOT NULL auto_increment,
           resource varchar(255) BINARY NOT NULL,
	   title varchar(255) BINARY NOT NULL,
	   UNIQUE KEY id (ID,resource(255)),
	   INDEX (ID),
	   INDEX (resource))""",
  # Queries: auto-numbered (Value, Count) rows; checkTables() grants
  # nobody@localhost INSERT/SELECT/UPDATE on this table
  "Queries" : """CREATE TABLE Queries (
           ID int(11) NOT NULL auto_increment,
	   Value varchar(50) NOT NULL,
	   Count int(11) NOT NULL,
	   UNIQUE KEY id (ID,Value(35)),
	   INDEX (ID))""",
  # AllQueries: same column layout as Queries
  "AllQueries" : """CREATE TABLE AllQueries (
           ID int(11) NOT NULL auto_increment,
	   Value varchar(50) NOT NULL,
	   Count int(11) NOT NULL,
	   UNIQUE KEY id (ID,Value(35)),
	   INDEX (ID))""",
}
    109 
    110 #
    111 # The XML API description file to parse
    112 #
API="libxml2-api.xml"
# Module-wide MySQL connection; stays None until openMySQL() assigns it.
DB=None
    115 
    116 #########################################################################
    117 #									#
    118 #                  MySQL database interfaces				#
    119 #									#
    120 #########################################################################
    121 def createTable(db, name):
    122     global TABLES
    123 
    124     if db == None:
    125         return -1
    126     if name == None:
    127         return -1
    128     c = db.cursor()
    129 
    130     ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
    131     if ret == 1:
    132         print "Removed table %s" % (name)
    133     print "Creating table %s" % (name)
    134     try:
    135         ret = c.execute(TABLES[name])
    136     except:
    137         print "Failed to create table %s" % (name)
    138 	return -1
    139     return ret
    140 
    141 def checkTables(db, verbose = 1):
    142     global TABLES
    143 
    144     if db == None:
    145         return -1
    146     c = db.cursor()
    147     nbtables = c.execute("show tables")
    148     if verbose:
    149 	print "Found %d tables" % (nbtables)
    150     tables = {}
    151     i = 0
    152     while i < nbtables:
    153         l = c.fetchone()
    154 	name = l[0]
    155 	tables[name] = {}
    156         i = i + 1
    157 
    158     for table in TABLES.keys():
    159         if not tables.has_key(table):
    160 	    print "table %s missing" % (table)
    161 	    createTable(db, table)
    162 	try:
    163 	    ret = c.execute("SELECT count(*) from %s" % table);
    164 	    row = c.fetchone()
    165 	    if verbose:
    166 		print "Table %s contains %d records" % (table, row[0])
    167 	except:
    168 	    print "Troubles with table %s : repairing" % (table)
    169 	    ret = c.execute("repair table %s" % table);
    170 	    print "repairing returned %d" % (ret)
    171 	    ret = c.execute("SELECT count(*) from %s" % table);
    172 	    row = c.fetchone()
    173 	    print "Table %s contains %d records" % (table, row[0])
    174     if verbose:
    175 	print "checkTables finished"
    176 
    177     # make sure apache can access the tables read-only
    178     try:
    179 	ret = c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
    180 	ret = c.execute("GRANT INSERT,SELECT,UPDATE  ON xmlsoft.Queries TO nobody@localhost")
    181     except:
    182         pass
    183     return 0
    184     
    185 def openMySQL(db="xmlsoft", passwd=None, verbose = 1):
    186     global DB
    187 
    188     if passwd == None:
    189         try:
    190 	    passwd = os.environ["MySQL_PASS"]
    191 	except:
    192 	    print "No password available, set environment MySQL_PASS"
    193 	    sys.exit(1)
    194 
    195     DB = MySQLdb.connect(passwd=passwd, db=db)
    196     if DB == None:
    197         return -1
    198     ret = checkTables(DB, verbose)
    199     return ret
    200 
    201 def updateWord(name, symbol, relevance):
    202     global DB
    203 
    204     if DB == None:
    205         openMySQL()
    206     if DB == None:
    207         return -1
    208     if name == None:
    209         return -1
    210     if symbol == None:
    211         return -1
    212 
    213     c = DB.cursor()
    214     try:
    215 	ret = c.execute(
    216 """INSERT INTO words (name, symbol, relevance) VALUES ('%s','%s', %d)""" %
    217 		(name, symbol, relevance))
    218     except:
    219         try:
    220 	    ret = c.execute(
    221     """UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'""" %
    222 		    (relevance, name, symbol))
    223 	except:
    224 	    print "Update word (%s, %s, %s) failed command" % (name, symbol, relevance)
    225 	    print "UPDATE words SET relevance = %d where name = '%s' and symbol = '%s'" % (relevance, name, symbol)
    226 	    print sys.exc_type, sys.exc_value
    227 	    return -1
    228 	     
    229     return ret
    230 
    231 def updateSymbol(name, module, type, desc):
    232     global DB
    233 
    234     updateWord(name, name, 50)
    235     if DB == None:
    236         openMySQL()
    237     if DB == None:
    238         return -1
    239     if name == None:
    240         return -1
    241     if module == None:
    242         return -1
    243     if type == None:
    244         return -1
    245 
    246     try:
    247 	desc = string.replace(desc, "'", " ")
    248 	l = string.split(desc, ".")
    249 	desc = l[0]
    250 	desc = desc[0:99]
    251     except:
    252         desc = ""
    253 
    254     c = DB.cursor()
    255     try:
    256 	ret = c.execute(
    257 """INSERT INTO symbols (name, module, type, descr) VALUES ('%s','%s', '%s', '%s')""" %
    258                     (name, module, type, desc))
    259     except:
    260         try:
    261 	    ret = c.execute(
    262 """UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" %
    263                     (module, type, desc, name))
    264         except:
    265 	    print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
    266 	    print """UPDATE symbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name)
    267 	    print sys.exc_type, sys.exc_value
    268 	    return -1
    269 	     
    270     return ret
    271         
def addFunction(name, module, desc = ""):
    """Record `name` as a 'function' symbol of `module` via updateSymbol()."""
    return updateSymbol(name, module, 'function', desc)
    274 
def addMacro(name, module, desc = ""):
    """Record `name` as a 'macro' symbol of `module` via updateSymbol()."""
    return updateSymbol(name, module, 'macro', desc)
    277 
def addEnum(name, module, desc = ""):
    """Record `name` as an 'enum' symbol of `module` via updateSymbol()."""
    return updateSymbol(name, module, 'enum', desc)
    280 
def addStruct(name, module, desc = ""):
    """Record `name` as a 'struct' symbol of `module` via updateSymbol()."""
    return updateSymbol(name, module, 'struct', desc)
    283 
def addConst(name, module, desc = ""):
    """Record `name` as a 'const' symbol of `module` via updateSymbol()."""
    return updateSymbol(name, module, 'const', desc)
    286 
def addType(name, module, desc = ""):
    """Record `name` as a 'type' symbol of `module` via updateSymbol()."""
    return updateSymbol(name, module, 'type', desc)
    289 
def addFunctype(name, module, desc = ""):
    """Record `name` as a 'functype' symbol of `module` via updateSymbol()."""
    return updateSymbol(name, module, 'functype', desc)
    292 
    293 def addPage(resource, title):
    294     global DB
    295 
    296     if DB == None:
    297         openMySQL()
    298     if DB == None:
    299         return -1
    300     if resource == None:
    301         return -1
    302 
    303     c = DB.cursor()
    304     try:
    305 	ret = c.execute(
    306 	    """INSERT INTO pages (resource, title) VALUES ('%s','%s')""" %
    307                     (resource, title))
    308     except:
    309         try:
    310 	    ret = c.execute(
    311 		"""UPDATE pages SET title='%s' WHERE resource='%s'""" %
    312                     (title, resource))
    313         except:
    314 	    print "Update symbol (%s, %s, %s) failed command" % (name, module, type)
    315 	    print """UPDATE pages SET title='%s' WHERE resource='%s'""" % (title, resource)
    316 	    print sys.exc_type, sys.exc_value
    317 	    return -1
    318 	     
    319     return ret
    320 
    321 def updateWordHTML(name, resource, desc, id, relevance):
    322     global DB
    323 
    324     if DB == None:
    325         openMySQL()
    326     if DB == None:
    327         return -1
    328     if name == None:
    329         return -1
    330     if resource == None:
    331         return -1
    332     if id == None:
    333         id = ""
    334     if desc == None:
    335         desc = ""
    336     else:
    337 	try:
    338 	    desc = string.replace(desc, "'", " ")
    339 	    desc = desc[0:99]
    340 	except:
    341 	    desc = ""
    342 
    343     c = DB.cursor()
    344     try:
    345 	ret = c.execute(
    346 """INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES ('%s','%s', '%s', '%s', '%d')""" %
    347                     (name, resource, desc, id, relevance))
    348     except:
    349         try:
    350 	    ret = c.execute(
    351 """UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" %
    352                     (desc, id, relevance, name, resource))
    353         except:
    354 	    print "Update symbol (%s, %s, %d) failed command" % (name, resource, relevance)
    355 	    print """UPDATE wordsHTML SET section='%s', id='%s', relevance='%d' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource)
    356 	    print sys.exc_type, sys.exc_value
    357 	    return -1
    358 	     
    359     return ret
    360 
    361 def checkXMLMsgArchive(url):
    362     global DB
    363 
    364     if DB == None:
    365         openMySQL()
    366     if DB == None:
    367         return -1
    368     if url == None:
    369         return -1
    370 
    371     c = DB.cursor()
    372     try:
    373 	ret = c.execute(
    374 	    """SELECT ID FROM archives WHERE resource='%s'""" % (url))
    375 	row = c.fetchone()
    376 	if row == None:
    377 	    return -1
    378     except:
    379 	return -1
    380 	     
    381     return row[0]
    382     
    383 def addXMLMsgArchive(url, title):
    384     global DB
    385 
    386     if DB == None:
    387         openMySQL()
    388     if DB == None:
    389         return -1
    390     if url == None:
    391         return -1
    392     if title == None:
    393         title = ""
    394     else:
    395 	title = string.replace(title, "'", " ")
    396 	title = title[0:99]
    397 
    398     c = DB.cursor()
    399     try:
    400         cmd = """INSERT INTO archives (resource, title) VALUES ('%s','%s')""" % (url, title)
    401         ret = c.execute(cmd)
    402 	cmd = """SELECT ID FROM archives WHERE resource='%s'""" % (url)
    403         ret = c.execute(cmd)
    404 	row = c.fetchone()
    405 	if row == None:
    406 	    print "addXMLMsgArchive failed to get the ID: %s" % (url)
    407 	    return -1
    408     except:
    409         print "addXMLMsgArchive failed command: %s" % (cmd)
    410 	return -1
    411 	     
    412     return((int)(row[0]))
    413 
    414 def updateWordArchive(name, id, relevance):
    415     global DB
    416 
    417     if DB == None:
    418         openMySQL()
    419     if DB == None:
    420         return -1
    421     if name == None:
    422         return -1
    423     if id == None:
    424         return -1
    425 
    426     c = DB.cursor()
    427     try:
    428 	ret = c.execute(
    429 """INSERT INTO wordsArchive (name, id, relevance) VALUES ('%s', '%d', '%d')""" %
    430                     (name, id, relevance))
    431     except:
    432         try:
    433 	    ret = c.execute(
    434 """UPDATE wordsArchive SET relevance='%d' where name='%s' and ID='%d'""" %
    435                     (relevance, name, id))
    436         except:
    437 	    print "Update word archive (%s, %d, %d) failed command" % (name, id, relevance)
    438 	    print """UPDATE wordsArchive SET relevance='%d' where name='%s' and ID='%d'""" % (relevance, name, id)
    439 	    print sys.exc_type, sys.exc_value
    440 	    return -1
    441 
    442     return ret
    443 
    444 #########################################################################
    445 #									#
    446 #                  Word dictionary and analysis routines		#
    447 #									#
    448 #########################################################################
    449 
    450 #
# top 100 English words, excluding those shorter than 3 letters, plus our own set
    452 #
    453 dropWords = {
    454     'the':0, 'this':0, 'can':0, 'man':0, 'had':0, 'him':0, 'only':0,
    455     'and':0, 'not':0, 'been':0, 'other':0, 'even':0, 'are':0, 'was':0,
    456     'new':0, 'most':0, 'but':0, 'when':0, 'some':0, 'made':0, 'from':0,
    457     'who':0, 'could':0, 'after':0, 'that':0, 'will':0, 'time':0, 'also':0,
    458     'have':0, 'more':0, 'these':0, 'did':0, 'was':0, 'two':0, 'many':0,
    459     'they':0, 'may':0, 'before':0, 'for':0, 'which':0, 'out':0, 'then':0,
    460     'must':0, 'one':0, 'through':0, 'with':0, 'you':0, 'said':0,
    461     'first':0, 'back':0, 'were':0, 'what':0, 'any':0, 'years':0, 'his':0,
    462     'her':0, 'where':0, 'all':0, 'its':0, 'now':0, 'much':0, 'she':0,
    463     'about':0, 'such':0, 'your':0, 'there':0, 'into':0, 'like':0, 'may':0,
    464     'would':0, 'than':0, 'our':0, 'well':0, 'their':0, 'them':0, 'over':0,
    465     'down':0,
    466     'net':0, 'www':0, 'bad':0, 'Okay':0, 'bin':0, 'cur':0,
    467 }
    468 
    469 wordsDict = {}
    470 wordsDictHTML = {}
    471 wordsDictArchive = {}
    472 
    473 def cleanupWordsString(str):
    474     str = string.replace(str, ".", " ")
    475     str = string.replace(str, "!", " ")
    476     str = string.replace(str, "?", " ")
    477     str = string.replace(str, ",", " ")
    478     str = string.replace(str, "'", " ")
    479     str = string.replace(str, '"', " ")
    480     str = string.replace(str, ";", " ")
    481     str = string.replace(str, "(", " ")
    482     str = string.replace(str, ")", " ")
    483     str = string.replace(str, "{", " ")
    484     str = string.replace(str, "}", " ")
    485     str = string.replace(str, "<", " ")
    486     str = string.replace(str, ">", " ")
    487     str = string.replace(str, "=", " ")
    488     str = string.replace(str, "/", " ")
    489     str = string.replace(str, "*", " ")
    490     str = string.replace(str, ":", " ")
    491     str = string.replace(str, "#", " ")
    492     str = string.replace(str, "\\", " ")
    493     str = string.replace(str, "\n", " ")
    494     str = string.replace(str, "\r", " ")
    495     str = string.replace(str, "\xc2", " ")
    496     str = string.replace(str, "\xa0", " ")
    497     return str
    498     
    499 def cleanupDescrString(str):
    500     str = string.replace(str, "'", " ")
    501     str = string.replace(str, "\n", " ")
    502     str = string.replace(str, "\r", " ")
    503     str = string.replace(str, "\xc2", " ")
    504     str = string.replace(str, "\xa0", " ")
    505     l = string.split(str)
    506     str = string.join(str)
    507     return str
    508 
    509 def splitIdentifier(str):
    510     ret = []
    511     while str != "":
    512         cur = string.lower(str[0])
    513 	str = str[1:]
    514 	if ((cur < 'a') or (cur > 'z')):
    515 	    continue
    516 	while (str != "") and (str[0] >= 'A') and (str[0] <= 'Z'):
    517 	    cur = cur + string.lower(str[0])
    518 	    str = str[1:]
    519 	while (str != "") and (str[0] >= 'a') and (str[0] <= 'z'):
    520 	    cur = cur + str[0]
    521 	    str = str[1:]
    522 	while (str != "") and (str[0] >= '0') and (str[0] <= '9'):
    523 	    str = str[1:]
    524 	ret.append(cur)
    525     return ret
    526 
    527 def addWord(word, module, symbol, relevance):
    528     global wordsDict
    529 
    530     if word == None or len(word) < 3:
    531         return -1
    532     if module == None or symbol == None:
    533         return -1
    534     if dropWords.has_key(word):
    535         return 0
    536     if ord(word[0]) > 0x80:
    537         return 0
    538 
    539     if wordsDict.has_key(word):
    540         d = wordsDict[word]
    541 	if d == None:
    542 	    return 0
    543 	if len(d) > 500:
    544 	    wordsDict[word] = None
    545 	    return 0
    546 	try:
    547 	    relevance = relevance + d[(module, symbol)]
    548 	except:
    549 	    pass
    550     else:
    551         wordsDict[word] = {}
    552     wordsDict[word][(module, symbol)] = relevance
    553     return relevance
    554     
    555 def addString(str, module, symbol, relevance):
    556     if str == None or len(str) < 3:
    557         return -1
    558     ret = 0
    559     str = cleanupWordsString(str)
    560     l = string.split(str)
    561     for word in l:
    562 	if len(word) > 2:
    563 	    ret = ret + addWord(word, module, symbol, 5)
    564 
    565     return ret
    566 
    567 def addWordHTML(word, resource, id, section, relevance):
    568     global wordsDictHTML
    569 
    570     if word == None or len(word) < 3:
    571         return -1
    572     if resource == None or section == None:
    573         return -1
    574     if dropWords.has_key(word):
    575         return 0
    576     if ord(word[0]) > 0x80:
    577         return 0
    578 
    579     section = cleanupDescrString(section)
    580 
    581     if wordsDictHTML.has_key(word):
    582         d = wordsDictHTML[word]
    583 	if d == None:
    584 	    print "skipped %s" % (word)
    585 	    return 0
    586 	try:
    587 	    (r,i,s) = d[resource]
    588 	    if i != None:
    589 	        id = i
    590 	    if s != None:
    591 	        section = s
    592 	    relevance = relevance + r
    593 	except:
    594 	    pass
    595     else:
    596         wordsDictHTML[word] = {}
    597     d = wordsDictHTML[word];
    598     d[resource] = (relevance, id, section)
    599     return relevance
    600     
    601 def addStringHTML(str, resource, id, section, relevance):
    602     if str == None or len(str) < 3:
    603         return -1
    604     ret = 0
    605     str = cleanupWordsString(str)
    606     l = string.split(str)
    607     for word in l:
    608 	if len(word) > 2:
    609 	    try:
    610 		r = addWordHTML(word, resource, id, section, relevance)
    611 		if r < 0:
    612 		    print "addWordHTML failed: %s %s" % (word, resource)
    613 		ret = ret + r
    614 	    except:
    615 		print "addWordHTML failed: %s %s %d" % (word, resource, relevance)
    616 		print sys.exc_type, sys.exc_value
    617 
    618     return ret
    619 
    620 def addWordArchive(word, id, relevance):
    621     global wordsDictArchive
    622 
    623     if word == None or len(word) < 3:
    624         return -1
    625     if id == None or id == -1:
    626         return -1
    627     if dropWords.has_key(word):
    628         return 0
    629     if ord(word[0]) > 0x80:
    630         return 0
    631 
    632     if wordsDictArchive.has_key(word):
    633         d = wordsDictArchive[word]
    634 	if d == None:
    635 	    print "skipped %s" % (word)
    636 	    return 0
    637 	try:
    638 	    r = d[id]
    639 	    relevance = relevance + r
    640 	except:
    641 	    pass
    642     else:
    643         wordsDictArchive[word] = {}
    644     d = wordsDictArchive[word];
    645     d[id] = relevance
    646     return relevance
    647     
    648 def addStringArchive(str, id, relevance):
    649     if str == None or len(str) < 3:
    650         return -1
    651     ret = 0
    652     str = cleanupWordsString(str)
    653     l = string.split(str)
    654     for word in l:
    655         i = len(word)
    656 	if i > 2:
    657 	    try:
    658 		r = addWordArchive(word, id, relevance)
    659 		if r < 0:
    660 		    print "addWordArchive failed: %s %s" % (word, id)
    661 		else:
    662 		    ret = ret + r
    663 	    except:
    664 		print "addWordArchive failed: %s %s %d" % (word, id, relevance)
    665 		print sys.exc_type, sys.exc_value
    666     return ret
    667 
    668 #########################################################################
    669 #									#
    670 #                  XML API description analysis				#
    671 #									#
    672 #########################################################################
    673 
def loadAPI(filename):
    """Parse `filename` with libxml2.parseFile() and return the document."""
    doc = libxml2.parseFile(filename)
    print "loaded %s" % (filename)
    return doc
    678 
    679 def foundExport(file, symbol):
    680     if file == None:
    681         return 0
    682     if symbol == None:
    683         return 0
    684     addFunction(symbol, file)
    685     l = splitIdentifier(symbol)
    686     for word in l:
    687 	addWord(word, file, symbol, 10)
    688     return 1
    689      
    690 def analyzeAPIFile(top):
    691     count = 0
    692     name = top.prop("name")
    693     cur = top.children
    694     while cur != None:
    695         if cur.type == 'text':
    696 	    cur = cur.next
    697 	    continue
    698 	if cur.name == "exports":
    699 	    count = count + foundExport(name, cur.prop("symbol"))
    700 	else:
    701 	    print "unexpected element %s in API doc <file name='%s'>" % (name)
    702         cur = cur.next
    703     return count
    704 
    705 def analyzeAPIFiles(top):
    706     count = 0
    707     cur = top.children
    708         
    709     while cur != None:
    710         if cur.type == 'text':
    711 	    cur = cur.next
    712 	    continue
    713 	if cur.name == "file":
    714 	    count = count + analyzeAPIFile(cur)
    715 	else:
    716 	    print "unexpected element %s in API doc <files>" % (cur.name)
    717         cur = cur.next
    718     return count
    719 
    720 def analyzeAPIEnum(top):
    721     file = top.prop("file")
    722     if file == None:
    723         return 0
    724     symbol = top.prop("name")
    725     if symbol == None:
    726         return 0
    727 
    728     addEnum(symbol, file)
    729     l = splitIdentifier(symbol)
    730     for word in l:
    731 	addWord(word, file, symbol, 10)
    732 
    733     return 1
    734 
    735 def analyzeAPIConst(top):
    736     file = top.prop("file")
    737     if file == None:
    738         return 0
    739     symbol = top.prop("name")
    740     if symbol == None:
    741         return 0
    742 
    743     addConst(symbol, file)
    744     l = splitIdentifier(symbol)
    745     for word in l:
    746 	addWord(word, file, symbol, 10)
    747 
    748     return 1
    749 
    750 def analyzeAPIType(top):
    751     file = top.prop("file")
    752     if file == None:
    753         return 0
    754     symbol = top.prop("name")
    755     if symbol == None:
    756         return 0
    757 
    758     addType(symbol, file)
    759     l = splitIdentifier(symbol)
    760     for word in l:
    761 	addWord(word, file, symbol, 10)
    762     return 1
    763 
    764 def analyzeAPIFunctype(top):
    765     file = top.prop("file")
    766     if file == None:
    767         return 0
    768     symbol = top.prop("name")
    769     if symbol == None:
    770         return 0
    771 
    772     addFunctype(symbol, file)
    773     l = splitIdentifier(symbol)
    774     for word in l:
    775 	addWord(word, file, symbol, 10)
    776     return 1
    777 
    778 def analyzeAPIStruct(top):
    779     file = top.prop("file")
    780     if file == None:
    781         return 0
    782     symbol = top.prop("name")
    783     if symbol == None:
    784         return 0
    785 
    786     addStruct(symbol, file)
    787     l = splitIdentifier(symbol)
    788     for word in l:
    789 	addWord(word, file, symbol, 10)
    790 
    791     info = top.prop("info")
    792     if info != None:
    793 	info = string.replace(info, "'", " ")
    794 	info = string.strip(info)
    795 	l = string.split(info)
    796 	for word in l:
    797 	    if len(word) > 2:
    798 		addWord(word, file, symbol, 5)
    799     return 1
    800 
    801 def analyzeAPIMacro(top):
    802     file = top.prop("file")
    803     if file == None:
    804         return 0
    805     symbol = top.prop("name")
    806     if symbol == None:
    807         return 0
    808     symbol = string.replace(symbol, "'", " ")
    809     symbol = string.strip(symbol)
    810 
    811     info = None
    812     cur = top.children
    813     while cur != None:
    814         if cur.type == 'text':
    815 	    cur = cur.next
    816 	    continue
    817 	if cur.name == "info":
    818 	    info = cur.content
    819 	    break
    820         cur = cur.next
    821 
    822     l = splitIdentifier(symbol)
    823     for word in l:
    824 	addWord(word, file, symbol, 10)
    825 
    826     if info == None:
    827 	addMacro(symbol, file)
    828         print "Macro %s description has no <info>" % (symbol)
    829         return 0
    830 
    831     info = string.replace(info, "'", " ")
    832     info = string.strip(info)
    833     addMacro(symbol, file, info)
    834     l = string.split(info)
    835     for word in l:
    836 	if len(word) > 2:
    837 	    addWord(word, file, symbol, 5)
    838     return 1
    839 
    840 def analyzeAPIFunction(top):
    841     file = top.prop("file")
    842     if file == None:
    843         return 0
    844     symbol = top.prop("name")
    845     if symbol == None:
    846         return 0
    847 
    848     symbol = string.replace(symbol, "'", " ")
    849     symbol = string.strip(symbol)
    850     info = None
    851     cur = top.children
    852     while cur != None:
    853         if cur.type == 'text':
    854 	    cur = cur.next
    855 	    continue
    856 	if cur.name == "info":
    857 	    info = cur.content
    858 	elif cur.name == "return":
    859 	    rinfo = cur.prop("info")
    860 	    if rinfo != None:
    861 		rinfo = string.replace(rinfo, "'", " ")
    862 		rinfo = string.strip(rinfo)
    863 	        addString(rinfo, file, symbol, 7)
    864 	elif cur.name == "arg":
    865 	    ainfo = cur.prop("info")
    866 	    if ainfo != None:
    867 		ainfo = string.replace(ainfo, "'", " ")
    868 		ainfo = string.strip(ainfo)
    869 	        addString(ainfo, file, symbol, 5)
    870 	    name = cur.prop("name")
    871 	    if name != None:
    872 		name = string.replace(name, "'", " ")
    873 		name = string.strip(name)
    874 	        addWord(name, file, symbol, 7)
    875         cur = cur.next
    876     if info == None:
    877         print "Function %s description has no <info>" % (symbol)
    878 	addFunction(symbol, file, "")
    879     else:
    880         info = string.replace(info, "'", " ")
    881 	info = string.strip(info)
    882 	addFunction(symbol, file, info)
    883         addString(info, file, symbol, 5)
    884 
    885     l = splitIdentifier(symbol)
    886     for word in l:
    887 	addWord(word, file, symbol, 10)
    888 
    889     return 1
    890 
    891 def analyzeAPISymbols(top):
    892     count = 0
    893     cur = top.children
    894         
    895     while cur != None:
    896         if cur.type == 'text':
    897 	    cur = cur.next
    898 	    continue
    899 	if cur.name == "macro":
    900 	    count = count + analyzeAPIMacro(cur)
    901 	elif cur.name == "function":
    902 	    count = count + analyzeAPIFunction(cur)
    903 	elif cur.name == "const":
    904 	    count = count + analyzeAPIConst(cur)
    905 	elif cur.name == "typedef":
    906 	    count = count + analyzeAPIType(cur)
    907 	elif cur.name == "struct":
    908 	    count = count + analyzeAPIStruct(cur)
    909 	elif cur.name == "enum":
    910 	    count = count + analyzeAPIEnum(cur)
    911 	elif cur.name == "functype":
    912 	    count = count + analyzeAPIFunctype(cur)
    913 	else:
    914 	    print "unexpected element %s in API doc <files>" % (cur.name)
    915         cur = cur.next
    916     return count
    917 
    918 def analyzeAPI(doc):
    919     count = 0
    920     if doc == None:
    921         return -1
    922     root = doc.getRootElement()
    923     if root.name != "api":
    924         print "Unexpected root name"
    925         return -1
    926     cur = root.children
    927     while cur != None:
    928         if cur.type == 'text':
    929 	    cur = cur.next
    930 	    continue
    931 	if cur.name == "files":
    932 	    pass
    933 #	    count = count + analyzeAPIFiles(cur)
    934 	elif cur.name == "symbols":
    935 	    count = count + analyzeAPISymbols(cur)
    936 	else:
    937 	    print "unexpected element %s in API doc" % (cur.name)
    938         cur = cur.next
    939     return count
    940 
    941 #########################################################################
    942 #									#
    943 #                  Web pages parsing and analysis			#
    944 #									#
    945 #########################################################################
    946 
    947 import glob
    948 
    949 def analyzeHTMLText(doc, resource, p, section, id):
    950     words = 0
    951     try:
    952 	content = p.content
    953 	words = words + addStringHTML(content, resource, id, section, 5)
    954     except:
    955         return -1
    956     return words
    957 
    958 def analyzeHTMLPara(doc, resource, p, section, id):
    959     words = 0
    960     try:
    961 	content = p.content
    962 	words = words + addStringHTML(content, resource, id, section, 5)
    963     except:
    964         return -1
    965     return words
    966 
    967 def analyzeHTMLPre(doc, resource, p, section, id):
    968     words = 0
    969     try:
    970 	content = p.content
    971 	words = words + addStringHTML(content, resource, id, section, 5)
    972     except:
    973         return -1
    974     return words
    975 
    976 def analyzeHTML(doc, resource, p, section, id):
    977     words = 0
    978     try:
    979 	content = p.content
    980 	words = words + addStringHTML(content, resource, id, section, 5)
    981     except:
    982         return -1
    983     return words
    984 
    985 def analyzeHTML(doc, resource):
    986     para = 0;
    987     ctxt = doc.xpathNewContext()
    988     try:
    989 	res = ctxt.xpathEval("//head/title")
    990 	title = res[0].content
    991     except:
    992         title = "Page %s" % (resource)
    993     addPage(resource, title)
    994     try:
    995 	items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
    996 	section = title
    997 	id = ""
    998 	for item in items:
    999 	    if item.name == 'h1' or item.name == 'h2' or item.name == 'h3':
   1000 	        section = item.content
   1001 		if item.prop("id"):
   1002 		    id = item.prop("id")
   1003 		elif item.prop("name"):
   1004 		    id = item.prop("name")
   1005 	    elif item.type == 'text':
   1006 	        analyzeHTMLText(doc, resource, item, section, id)
   1007 		para = para + 1
   1008 	    elif item.name == 'p':
   1009 	        analyzeHTMLPara(doc, resource, item, section, id)
   1010 		para = para + 1
   1011 	    elif item.name == 'pre':
   1012 	        analyzeHTMLPre(doc, resource, item, section, id)
   1013 		para = para + 1
   1014 	    else:
   1015 	        print "Page %s, unexpected %s element" % (resource, item.name)
   1016     except:
   1017         print "Page %s: problem analyzing" % (resource)
   1018 	print sys.exc_type, sys.exc_value
   1019 
   1020     return para
   1021 
   1022 def analyzeHTMLPages():
   1023     ret = 0
   1024     HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
   1025     for html in HTMLfiles:
   1026 	if html[0:3] == "API":
   1027 	    continue
   1028 	if html == "xml.html":
   1029 	    continue
   1030 	try:
   1031 	    doc = libxml2.parseFile(html)
   1032 	except:
   1033 	    doc = libxml2.htmlParseFile(html, None)
   1034 	try:
   1035 	    res = analyzeHTML(doc, html)
   1036 	    print "Parsed %s : %d paragraphs" % (html, res)
   1037 	    ret = ret + 1
   1038 	except:
   1039 	    print "could not parse %s" % (html)
   1040     return ret
   1041 
   1042 #########################################################################
   1043 #									#
   1044 #                  Mail archives parsing and analysis			#
   1045 #									#
   1046 #########################################################################
   1047 
   1048 import time
   1049 
   1050 def getXMLDateArchive(t = None):
   1051     if t == None:
   1052 	t = time.time()
   1053     T = time.gmtime(t)
   1054     month = time.strftime("%B", T)
   1055     year = T[0]
   1056     url = "http://mail.gnome.org/archives/xml/%d-%s/date.html" % (year, month)
   1057     return url
   1058 
   1059 def scanXMLMsgArchive(url, title, force = 0):
   1060     if url == None or title == None:
   1061         return 0
   1062 
   1063     ID = checkXMLMsgArchive(url)
   1064     if force == 0 and ID != -1:
   1065         return 0
   1066 
   1067     if ID == -1:
   1068 	ID = addXMLMsgArchive(url, title)
   1069 	if ID == -1:
   1070 	    return 0
   1071 
   1072     try:
   1073         print "Loading %s" % (url)
   1074         doc = libxml2.htmlParseFile(url, None);
   1075     except:
   1076         doc = None
   1077     if doc == None:
   1078         print "Failed to parse %s" % (url)
   1079 	return 0
   1080 
   1081     addStringArchive(title, ID, 20)
   1082     ctxt = doc.xpathNewContext()
   1083     texts = ctxt.xpathEval("//pre//text()")
   1084     for text in texts:
   1085         addStringArchive(text.content, ID, 5)
   1086 
   1087     return 1
   1088 
   1089 def scanXMLDateArchive(t = None, force = 0):
   1090     global wordsDictArchive
   1091 
   1092     wordsDictArchive = {}
   1093 
   1094     url = getXMLDateArchive(t)
   1095     print "loading %s" % (url)
   1096     try:
   1097 	doc = libxml2.htmlParseFile(url, None);
   1098     except:
   1099         doc = None
   1100     if doc == None:
   1101         print "Failed to parse %s" % (url)
   1102 	return -1
   1103     ctxt = doc.xpathNewContext()
   1104     anchors = ctxt.xpathEval("//a[@href]")
   1105     links = 0
   1106     newmsg = 0
   1107     for anchor in anchors:
   1108 	href = anchor.prop("href")
   1109 	if href == None or href[0:3] != "msg":
   1110 	    continue
   1111         try:
   1112 	    links = links + 1
   1113 
   1114 	    msg = libxml2.buildURI(href, url)
   1115 	    title = anchor.content
   1116 	    if title != None and title[0:4] == 'Re: ':
   1117 	        title = title[4:]
   1118 	    if title != None and title[0:6] == '[xml] ':
   1119 	        title = title[6:]
   1120 	    newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
   1121 
   1122 	except:
   1123 	    pass
   1124 
   1125     return newmsg
   1126     
   1127 
   1128 #########################################################################
   1129 #									#
   1130 #          Main code: open the DB, the API XML and analyze it		#
   1131 #									#
   1132 #########################################################################
   1133 def analyzeArchives(t = None, force = 0):
   1134     global wordsDictArchive
   1135 
   1136     ret = scanXMLDateArchive(t, force)
   1137     print "Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret)
   1138 
   1139     i = 0
   1140     skipped = 0
   1141     for word in wordsDictArchive.keys():
   1142 	refs = wordsDictArchive[word]
   1143 	if refs  == None:
   1144 	    skipped = skipped + 1
   1145 	    continue;
   1146 	for id in refs.keys():
   1147 	    relevance = refs[id]
   1148 	    updateWordArchive(word, id, relevance)
   1149 	    i = i + 1
   1150 
   1151     print "Found %d associations in HTML pages" % (i)
   1152 
   1153 def analyzeHTMLTop():
   1154     global wordsDictHTML
   1155 
   1156     ret = analyzeHTMLPages()
   1157     print "Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret)
   1158 
   1159     i = 0
   1160     skipped = 0
   1161     for word in wordsDictHTML.keys():
   1162 	refs = wordsDictHTML[word]
   1163 	if refs  == None:
   1164 	    skipped = skipped + 1
   1165 	    continue;
   1166 	for resource in refs.keys():
   1167 	    (relevance, id, section) = refs[resource]
   1168 	    updateWordHTML(word, resource, section, id, relevance)
   1169 	    i = i + 1
   1170 
   1171     print "Found %d associations in HTML pages" % (i)
   1172 
   1173 def analyzeAPITop():
   1174     global wordsDict
   1175     global API
   1176 
   1177     try:
   1178 	doc = loadAPI(API)
   1179 	ret = analyzeAPI(doc)
   1180 	print "Analyzed %d blocs" % (ret)
   1181 	doc.freeDoc()
   1182     except:
   1183 	print "Failed to parse and analyze %s" % (API)
   1184 	print sys.exc_type, sys.exc_value
   1185 	sys.exit(1)
   1186 
   1187     print "Indexed %d words" % (len(wordsDict))
   1188     i = 0
   1189     skipped = 0
   1190     for word in wordsDict.keys():
   1191 	refs = wordsDict[word]
   1192 	if refs  == None:
   1193 	    skipped = skipped + 1
   1194 	    continue;
   1195 	for (module, symbol) in refs.keys():
   1196 	    updateWord(word, symbol, refs[(module, symbol)])
   1197 	    i = i + 1
   1198 
   1199     print "Found %d associations, skipped %d words" % (i, skipped)
   1200 
   1201 def usage():
   1202     print "Usage index.py [--force] [--archive]  [--archive-year year] [--archive-month month] [--API] [--docs]"
   1203     sys.exit(1)
   1204 
   1205 def main():
   1206     try:
   1207 	openMySQL()
   1208     except:
   1209 	print "Failed to open the database"
   1210 	print sys.exc_type, sys.exc_value
   1211 	sys.exit(1)
   1212 
   1213     args = sys.argv[1:]
   1214     force = 0
   1215     if args:
   1216         i = 0
   1217 	while i < len(args):
   1218 	    if args[i] == '--force':
   1219 	        force = 1
   1220 	    elif args[i] == '--archive':
   1221 	        analyzeArchives(None, force)
   1222 	    elif args[i] == '--archive-year':
   1223 	        i = i + 1;
   1224 		year = args[i]
   1225 		months = ["January" , "February", "March", "April", "May",
   1226 			  "June", "July", "August", "September", "October",
   1227 			  "November", "December"];
   1228 	        for month in months:
   1229 		    try:
   1230 		        str = "%s-%s" % (year, month)
   1231 			T = time.strptime(str, "%Y-%B")
   1232 			t = time.mktime(T) + 3600 * 24 * 10;
   1233 			analyzeArchives(t, force)
   1234 		    except:
   1235 			print "Failed to index month archive:"
   1236 			print sys.exc_type, sys.exc_value
   1237 	    elif args[i] == '--archive-month':
   1238 	        i = i + 1;
   1239 		month = args[i]
   1240 		try:
   1241 		    T = time.strptime(month, "%Y-%B")
   1242 		    t = time.mktime(T) + 3600 * 24 * 10;
   1243 		    analyzeArchives(t, force)
   1244 		except:
   1245 		    print "Failed to index month archive:"
   1246 		    print sys.exc_type, sys.exc_value
   1247 	    elif args[i] == '--API':
   1248 	        analyzeAPITop()
   1249 	    elif args[i] == '--docs':
   1250 	        analyzeHTMLTop()
   1251 	    else:
   1252 	        usage()
   1253 	    i = i + 1
   1254     else:
   1255         usage()
   1256 
   1257 if __name__ == "__main__":
   1258     main()
   1259