Home | History | Annotate | Download | only in documentation
      1 #!/usr/bin/env python
      2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 '''This utility cleans up the html files as emitted by doxygen so
      7 that they are suitable for publication on a Google documentation site.
      8 '''
      9 
     10 import optparse
     11 import os
     12 import re
     13 import shutil
     14 import string
     15 import sys
     16 try:
     17   from BeautifulSoup import BeautifulSoup, Tag
     18 except (ImportError, NotImplementedError):
     19   print ("This tool requires the BeautifulSoup package "
     20          "(see http://www.crummy.com/software/BeautifulSoup/).\n"
     21          "Make sure that the file BeautifulSoup.py is either in this directory "
     22          "or is available in your PYTHON_PATH")
     23   raise
     24 
     25 
     26 class HTMLFixer(object):
     27   '''This class cleans up the html strings as produced by Doxygen
     28   '''
     29 
     30   def __init__(self, html):
     31     self.soup = BeautifulSoup(html)
     32 
     33   def FixTableHeadings(self):
     34     '''Fixes the doxygen table headings.
     35 
     36     This includes:
     37       - Using bare <h2> title row instead of row embedded in <tr><td> in table
     38       - Putting the "name" attribute into the "id" attribute of the <tr> tag.
     39       - Splitting up tables into multiple separate tables if a table
     40         heading appears in the middle of a table.
     41 
     42     For example, this html:
     43      <table>
     44       <tr><td colspan="2"><h2><a name="pub-attribs"></a>
     45       Data Fields List</h2></td></tr>
     46       ...
     47      </table>
     48 
     49     would be converted to this:
     50      <h2>Data Fields List</h2>
     51      <table>
     52       ...
     53      </table>
     54     '''
     55 
     56     table_headers = []
     57     for tag in self.soup.findAll('tr'):
     58       if tag.td and tag.td.h2 and tag.td.h2.a and tag.td.h2.a['name']:
     59         #tag['id'] = tag.td.h2.a['name']
     60         tag.string = tag.td.h2.a.next
     61         tag.name = 'h2'
     62         table_headers.append(tag)
     63 
     64     # reverse the list so that earlier tags don't delete later tags
     65     table_headers.reverse()
     66     # Split up tables that have multiple table header (th) rows
     67     for tag in table_headers:
     68       print "Header tag: %s is %s" % (tag.name, tag.string.strip())
     69       # Is this a heading in the middle of a table?
     70       if tag.findPreviousSibling('tr') and tag.parent.name == 'table':
     71         print "Splitting Table named %s" % tag.string.strip()
     72         table = tag.parent
     73         table_parent = table.parent
     74         table_index = table_parent.contents.index(table)
     75         new_table = Tag(self.soup, name='table', attrs=table.attrs)
     76         table_parent.insert(table_index + 1, new_table)
     77         tag_index = table.contents.index(tag)
     78         for index, row in enumerate(table.contents[tag_index:]):
     79           new_table.insert(index, row)
     80       # Now move the <h2> tag to be in front of the <table> tag
     81       assert tag.parent.name == 'table'
     82       table = tag.parent
     83       table_parent = table.parent
     84       table_index = table_parent.contents.index(table)
     85       table_parent.insert(table_index, tag)
     86 
     87   def RemoveTopHeadings(self):
     88     '''Removes <div> sections with a header, tabs, or navpath class attribute'''
     89     header_tags = self.soup.findAll(
     90         name='div',
     91         attrs={'class' : re.compile('^(header|tabs[0-9]*|navpath)$')})
     92     [tag.extract() for tag in header_tags]
     93 
     94   def FixAll(self):
     95     self.FixTableHeadings()
     96     self.RemoveTopHeadings()
     97 
     98   def __str__(self):
     99     return str(self.soup)
    100 
    101 
    102 def main():
    103   '''Main entry for the doxy_cleanup utility
    104 
    105   doxy_cleanup takes a list of html files and modifies them in place.'''
    106 
    107   parser = optparse.OptionParser(usage='Usage: %prog [options] files...')
    108 
    109   parser.add_option('-m', '--move', dest='move', action='store_true',
    110                     default=False, help='move html files to "original_html"')
    111 
    112   options, files = parser.parse_args()
    113 
    114   if not files:
    115     parser.print_usage()
    116     return 1
    117 
    118   for filename in files:
    119     try:
    120       with open(filename, 'r') as file:
    121         html = file.read()
    122 
    123       print "Processing %s" % filename
    124       fixer = HTMLFixer(html)
    125       fixer.FixAll()
    126       with open(filename, 'w') as file:
    127         file.write(str(fixer))
    128       if options.move:
    129         new_directory = os.path.join(
    130             os.path.dirname(os.path.dirname(filename)), 'original_html')
    131         if not os.path.exists(new_directory):
    132           os.mkdir(new_directory)
    133         shutil.move(filename, new_directory)
    134     except:
    135       print "Error while processing %s" % filename
    136       raise
    137 
    138   return 0
    139 
    140 
    141 if __name__ == '__main__':
    142   sys.exit(main())
    143