Home | History | Annotate | Download | only in server2
      1 # Copyright 2013 The Chromium Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 from HTMLParser import HTMLParser
      6 
      7 
      8 class ParseResult(object):
      9   '''The result of |ParseDocument|:
     10   |title|             The title of the page, as pulled from the first <h1>.
     11   |title_attributes|  The attributes of the <h1> tag the title is derived from.
     12   |sections|          The list of Sections within this document.
     13   |warnings|          Any warnings while parsing the document.
     14   '''
     15 
     16   def __init__(self, title, title_attributes, sections, warnings):
     17     self.title = title
     18     self.title_attributes = title_attributes
     19     self.sections = sections
     20     self.warnings = warnings
     21 
     22 
     23 class DocumentSection(object):
     24   '''A section of the document as grouped by <section>...</section>. Any content
     25   not within section tags is considered an implicit section, so:
     26   "Foo <section>Bar</section> Baz" is 3 sections.
     27   |structure|  A list of DocumentStructureEntry for each top-level heading.
     28   '''
     29 
     30   def __init__(self):
     31     self.structure = []
     32 
     33 
     34 class DocumentStructureEntry(object):
     35   '''An entry in the document structure.
     36   |attributes| The attributes of the header tag this entry is derived from.
     37   |name|       The name of this entry, as pulled from the header tag this entry
     38                is derived from.
     39   |entries|    A list of child DocumentStructureEntry items.
     40   '''
     41 
     42   def __init__(self, tag, attributes):
     43     self.attributes = attributes
     44     self.name = ''
     45     self.entries = []
     46     # Callers shouldn't care about the tag, but we need it for sanity checking,
     47     # so make it private. In particular we pretend that anything but the first
     48     # h1 is an h2, and it'd be odd to expose that.
     49     self._tag = tag
     50     # Documents can override the name of the entry using title="".
     51     self._has_explicit_name = False
     52 
     53   def __repr__(self):
     54     return '<%s>%s</%s>' % (self._tag, self.name, self._tag)
     55 
     56   def __str__(self):
     57     return repr(self)
     58 
     59 
     60 def ParseDocument(document, expect_title=False):
     61   '''Parses the title and a document structure form |document| and returns a
     62   ParseResult.
     63   '''
     64   parser = _DocumentParser(expect_title)
     65   parser.feed(document)
     66   parser.close()
     67   return parser.parse_result
     68 
     69 
     70 def RemoveTitle(document):
     71   '''Removes the first <h1>..</h1> tag found in |document| and returns a
     72   (result, warning) tuple.
     73 
     74   If no title is found or |document| is malformed in some way, returns the
     75   original document and a warning message. Otherwise, returns the result of
     76   removing the title from |document| with a None warning message.
     77   '''
     78 
     79   def min_index(lhs, rhs):
     80     lhs_index, rhs_index = document.find(lhs), document.find(rhs)
     81     if lhs_index == -1: return rhs_index
     82     if rhs_index == -1: return lhs_index
     83     return min(lhs_index, rhs_index)
     84 
     85   title_start = min_index('<h1', '<H1')
     86   if title_start == -1:
     87     return document, 'No opening <h1> was found'
     88   title_end = min_index('/h1>', '/H1>')
     89   if title_end == -1:
     90     return document, 'No closing </h1> was found'
     91   if title_end < title_start:
     92     return document, 'The </h1> appeared before the <h1>'
     93 
     94   return (document[:title_start] + document[title_end + 4:], None)
     95 
     96 
     97 _HEADER_TAGS = ['h2', 'h3', 'h4']
     98 
     99 
    100 class _DocumentParser(HTMLParser):
    101   '''HTMLParser for ParseDocument.
    102   '''
    103 
    104   def __init__(self, expect_title):
    105     HTMLParser.__init__(self)
    106     # Public.
    107     self.parse_result = None
    108     # Private.
    109     self._expect_title = expect_title
    110     self._title_entry = None
    111     self._sections = []
    112     self._processing_section = DocumentSection()
    113     self._processing_entry = None
    114     self._warnings = []
    115 
    116   def handle_starttag(self, tag, attrs):
    117     if tag == 'section':
    118       self._OnSectionBoundary()
    119       return
    120 
    121     if tag != 'h1' and tag not in _HEADER_TAGS:
    122       return
    123 
    124     if self._processing_entry is not None:
    125       self._WarnWithPosition('Found <%s> in the middle of processing a <%s>' %
    126                              (tag, self._processing_entry._tag))
    127       return
    128 
    129     attrs_dict = dict(attrs)
    130     self._processing_entry = DocumentStructureEntry(tag, attrs_dict)
    131 
    132     explicit_name = attrs_dict.pop('title', None)
    133     if explicit_name == '':
    134       # Don't create a TOC entry at all if the tag has specified title="".
    135       return
    136     if explicit_name is not None:
    137       self._processing_entry.name = explicit_name
    138       self._processing_entry._has_explicit_name = True
    139 
    140     if tag == 'h1' and self._title_entry is not None:
    141       self._WarnWithPosition('Found multiple <h1> tags. Subsequent <h1> tags '
    142                              'will be classified as <h2> for the purpose of '
    143                              'the structure')
    144       tag = 'h2'
    145 
    146     if tag == 'h1':
    147       self._title_entry = self._processing_entry
    148     else:
    149       belongs_to = self._processing_section.structure
    150       for header in _HEADER_TAGS[:_HEADER_TAGS.index(tag)]:
    151         if len(belongs_to) == 0:
    152           # TODO(kalman): Re-enable this warning once the reference pages have
    153           # their references fixed.
    154           #self._WarnWithPosition('Found <%s> without any preceding <%s>' %
    155           #                       (tag, header))
    156           break
    157         belongs_to = belongs_to[-1].entries
    158       belongs_to.append(self._processing_entry)
    159 
    160   def handle_endtag(self, tag):
    161     if tag == 'section':
    162       self._OnSectionBoundary()
    163       return
    164 
    165     if tag != 'h1' and tag not in _HEADER_TAGS:
    166       return
    167 
    168     if self._processing_entry is None:
    169       self._WarnWithPosition('Found closing </%s> without an opening <%s>' %
    170                              (tag, tag))
    171       return
    172 
    173     if self._processing_entry._tag != tag:
    174       self._WarnWithPosition('Found closing </%s> while processing a <%s>' %
    175                              (tag, self._processing_entry._tag))
    176       # Note: no early return, it's more likely that the mismatched header was
    177       # a typo rather than a misplaced closing header tag.
    178 
    179     self._processing_entry = None
    180 
    181   def handle_data(self, data):
    182     if (self._processing_entry is not None and
    183         not self._processing_entry._has_explicit_name):
    184       # += is inefficient, but probably fine here because the chances of a
    185       # large number of nested tags within header tags is pretty low.
    186       self._processing_entry.name += data
    187 
    188   def close(self):
    189     HTMLParser.close(self)
    190 
    191     self._OnSectionBoundary()
    192 
    193     if self._processing_entry is not None:
    194       self._warnings.append('Finished parsing while still processing a <%s>' %
    195                             parser._processing_entry._tag)
    196 
    197     if self._expect_title:
    198       if not self._title_entry:
    199         self._warnings.append('Expected a title')
    200         title, title_attributes = '', {}
    201       else:
    202         title, title_attributes = (
    203             self._title_entry.name, self._title_entry.attributes)
    204     else:
    205       if self._title_entry:
    206         self._warnings.append('Found unexpected title "%s"' %
    207                               self._title_entry.name)
    208       title, title_attributes = None, None
    209 
    210     self.parse_result = ParseResult(
    211         title, title_attributes, self._sections, self._warnings)
    212 
    213   def _OnSectionBoundary(self):
    214     # Only start a new section if the previous section was non-empty.
    215     if self._processing_section.structure:
    216       self._sections.append(self._processing_section)
    217       self._processing_section = DocumentSection()
    218 
    219   def _WarnWithPosition(self, message):
    220     line, col = self.getpos()
    221     self._warnings.append('%s (line %s, column %s)' % (message, line, col + 1))
    222