Home | History | Annotate | Download | only in generator
      1 #!/usr/bin/python
      2 # Copyright 2015 The Chromium OS Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 """Module for parsing TCG TPM2 library specification in HTML format.
      7 
      8 This module processes parts 2 and 3 of the specification, extracting
      9 information related to tables defined in the documents, feeding the
     10 information into the Table object for further processing and creating the
     11 appropriate TPM2 objects.
     12 """
     13 
     14 from __future__ import print_function
     15 
     16 import HTMLParser
     17 import os
     18 import re
     19 import sys
     20 
     21 import tpm_table
     22 
     23 table_name = re.compile(r'^\s*Table\s+[0-9]+')
     24 
     25 
     26 class SpecParser(HTMLParser.HTMLParser):
     27   """A class for parsing TCG specifications in html format."""
     28 
     29   # The state machine of the parser could be in one of the following states.
     30   ANCHOR = 0       # Look for table title anchor
     31   TABLE_NAME = 1   # Look for table title in the data stream
     32   TABLE_BODY = 2   # Scraping the actual table body
     33   MAYBE_DONE = 3   # Could be over, unless a single spec table is split in
     34                    # multiple HTML tables (to continue on the next page)
     35   SKIP_HEADER = 4  # Ignore the header of the split tables
     36 
     37   def __init__(self):
     38     """Initialize a parser object to default state."""
     39     HTMLParser.HTMLParser.__init__(self)
     40     self._state = self.ANCHOR
     41     self._title = ''
     42     self._table = tpm_table.Table()
     43     self._previous_table_number = 0  # Used to check if there are skipped tables
     44 
     45   def _Normalize(self, data):
     46     """Normalize HTML data.
     47 
     48     HTML files generated from TCG specifications sometimes include utf8
     49     characters (like long dashes), which appear only in comments/table titles
     50     and can be safely ignored.
     51 
     52     Args:
     53      data: a string representing portion of data from the HTML being parsed.
     54 
     55     Returns:
     56       a string, the input data with characters above ASCII printable range
     57                  excluded.
     58     """
     59     return ' ' + ''.join(x for x in self.unescape(data) if ord(x) < 128)
     60 
     61   def GetTable(self):
     62     """Return the Table object containing all information parsed so far."""
     63     return self._table
     64 
     65   def _SetState(self, new_state):
     66     if self._state != new_state:
     67       self._state = new_state
     68       if new_state == self.TABLE_NAME:
     69         self._title = ''
     70 
     71   def handle_starttag(self, tag, attrs):
     72     """Invoked each time a new HTML tag is opened.
     73 
     74     This method drives changes in the parser FSM states, its heuristics are
     75     derived from the format of the HTML files the TCG specs get converted to.
     76 
     77     Each specification table is preceded with a tittle. The title is wrapped
     78     in an anchor tag with a property 'name' set to 'bookmark#xxx. The title
     79     text starts with ' Table [0-9]+ '. Once the table title is detected,
     80     the state machine switches to looking for the actual HTML table, i.e. tags
     81     'table', 'tr' and 'td' (the generated specs do not use the 'th' tags).
     82 
     83     Large specification tables can be split into multiple HTML tables (so that
     84     they fit in a page). This is why the presence of the closing 'table' tag
     85     is not enough to close the parsing of the current specification table.
     86 
     87     In some cases the next table is defined in the spec immediately after the
     88     current one - this is when the new anchor tag is used as a signal that the
     89     previous table has been completely consumed.
     90 
     91     Args:
     92       tag: a string, the HTML tag
     93       attrs: a tuple of zero or more two-string tuples, the first element -
     94              the HTML tag's attribute, the second element - the attribute
     95              value.
     96     """
     97     if tag == 'a':
     98       if [x for x in attrs if x[0] == 'name' and x[1].startswith('bookmark')]:
     99         if self._state == self.ANCHOR:
    100           self._SetState(self.TABLE_NAME)
    101         elif self._state == self.MAYBE_DONE:
    102           # Done indeed
    103           self._table.ProcessTable()
    104           self._table.Init()
    105           self._SetState(self.TABLE_NAME)
    106         elif self._state == self.TABLE_NAME:
    107           self._title = ''
    108     elif tag == 'p' and self._state == self.TABLE_NAME and not self._title:
    109       # This was not a valid table start, back to looking for the right anchor.
    110       self._SetState(self.ANCHOR)
    111     elif self._state == self.TABLE_NAME and tag == 'table':
    112       if not table_name.search(self._title):
    113         # Table title does not match the expected format - back to square one.
    114         self._SetState(self.ANCHOR)
    115         return  # will have to start over
    116       table_number = int(self._title.split()[1])
    117       self._previous_table_number += 1
    118       if table_number > self._previous_table_number:
    119         print('Table(s) %s missing' % ' '.join(
    120             '%d' % x for x in
    121             range(self._previous_table_number, table_number)), file=sys.stderr)
    122         self._previous_table_number = table_number
    123       self._table.Init(self._title)
    124       self._SetState(self.TABLE_BODY)
    125     elif self._state == self.MAYBE_DONE and tag == 'tr':
    126       self._SetState(self.SKIP_HEADER)
    127     elif self._state == self.SKIP_HEADER and tag == 'tr':
    128       self._SetState(self.TABLE_BODY)
    129       self._table.NewRow()
    130     elif self._state == self.TABLE_BODY:
    131       if tag == 'tr':
    132         self._table.NewRow()
    133       elif tag == 'td':
    134         self._table.NewCell()
    135 
    136   def handle_endtag(self, tag):
    137     """Invoked each time an HTML tag is closed."""
    138     if tag == 'table' and self._table.InProgress():
    139       self._SetState(self.MAYBE_DONE)
    140 
    141   def handle_data(self, data):
    142     """Process data outside HTML tags."""
    143     if self._state == self.TABLE_NAME:
    144       self._title += ' %s' % self._Normalize(data)
    145     elif self._state == self.TABLE_BODY:
    146       self._table.AddData(self._Normalize(data))
    147     elif self._state == self.MAYBE_DONE:
    148       # Done indeed
    149       self._table.ProcessTable()
    150       self._table.Init()
    151       self._SetState(self.ANCHOR)
    152 
    153   def close(self):
    154     """Finish processing of the HTML buffer."""
    155     if self._state in (self.TABLE_BODY, self.MAYBE_DONE):
    156       self._table.ProcessTable()
    157     self._state = self.ANCHOR
    158 
    159   def handle_entityref(self, name):
    160     """Process HTML escape sequence."""
    161     entmap = {
    162         'amp': '&',
    163         'gt': '>',
    164         'lt': '<',
    165         'quot': '"',
    166     }
    167     if name in entmap:
    168       if self._state == self.TABLE_BODY:
    169         self._table.AddData(entmap[name])
    170       elif self._state == self.TABLE_NAME:
    171         self._title += entmap[name]
    172 
    173 
    174 def main(structs_html_file_name):
    175   """When invoked standalone - dump .h file on the console."""
    176   parser = SpecParser()
    177   with open(structs_html_file_name) as input_file:
    178     html_content = input_file.read()
    179   parser.feed(html_content)
    180   parser.close()
    181   print(parser.GetTable().GetHFile())
    182 
    183 if __name__ == '__main__':
    184   if len(sys.argv) != 2:
    185     print('%s: One parameter is required, the name of the html file '
    186           'which is the TPM2 library Part 2 specification' %
    187           os.path.basename(sys.argv[0]), file=sys.stderr)
    188     sys.exit(1)
    189   main(sys.argv[1])
    190