Home | History | Annotate | Download | only in py_vulcanize
      1 # Copyright (c) 2013 The Chromium Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 import os
      6 import sys
      7 
      8 from py_vulcanize import module
      9 from py_vulcanize import strip_js_comments
     10 from py_vulcanize import html_generation_controller
     11 
     12 
     13 def _AddToPathIfNeeded(path):
     14   if path not in sys.path:
     15     sys.path.insert(0, path)
     16 
     17 
     18 def _InitBeautifulSoup():
     19   catapult_path = os.path.abspath(
     20       os.path.join(os.path.dirname(__file__),
     21                    os.path.pardir, os.path.pardir, os.path.pardir))
     22   bs_path = os.path.join(catapult_path, 'third_party', 'beautifulsoup4')
     23   _AddToPathIfNeeded(bs_path)
     24 
     25   html5lib_path = os.path.join(catapult_path, 'third_party', 'html5lib-python')
     26   _AddToPathIfNeeded(html5lib_path)
     27 
     28   six_path = os.path.join(catapult_path, 'third_party', 'six')
     29   _AddToPathIfNeeded(six_path)
     30 
     31 
# _InitBeautifulSoup() puts the third_party/beautifulsoup4 directory on
# sys.path, so it has to run before the bs4 import below; that is why this
# import is not grouped with the others at the top of the file.
_InitBeautifulSoup()
import bs4
     34 
     35 
     36 class InlineScript(object):
     37 
     38   def __init__(self, soup):
     39     if not soup:
     40       raise module.DepsException('InlineScript created without soup')
     41     self._soup = soup
     42     self._stripped_contents = None
     43     self._open_tags = None
     44 
     45   @property
     46   def contents(self):
     47     return unicode(self._soup.string)
     48 
     49   @property
     50   def stripped_contents(self):
     51     if not self._stripped_contents:
     52       self._stripped_contents = strip_js_comments.StripJSComments(
     53           self.contents)
     54     return self._stripped_contents
     55 
     56   @property
     57   def open_tags(self):
     58     if self._open_tags:
     59       return self._open_tags
     60     open_tags = []
     61     cur = self._soup.parent
     62     while cur:
     63       if isinstance(cur, bs4.BeautifulSoup):
     64         break
     65 
     66       open_tags.append(_Tag(cur.name, cur.attrs))
     67       cur = cur.parent
     68 
     69     open_tags.reverse()
     70     assert open_tags[-1].tag == 'script'
     71     del open_tags[-1]
     72 
     73     self._open_tags = open_tags
     74     return self._open_tags
     75 
     76 
     77 def _CreateSoupWithoutHeadOrBody(html):
     78   soupCopy = bs4.BeautifulSoup(html, 'html5lib')
     79   soup = bs4.BeautifulSoup()
     80   soup.reset()
     81   if soupCopy.head:
     82     for n in soupCopy.head.contents:
     83       n.extract()
     84       soup.append(n)
     85   if soupCopy.body:
     86     for n in soupCopy.body.contents:
     87       n.extract()
     88       soup.append(n)
     89   return soup
     90 
     91 
     92 class HTMLModuleParserResults(object):
     93 
     94   def __init__(self, html):
     95     self._soup = bs4.BeautifulSoup(html, 'html5lib')
     96     self._inline_scripts = None
     97 
     98   @property
     99   def scripts_external(self):
    100     tags = self._soup.findAll('script', src=True)
    101     return [t['src'] for t in tags]
    102 
    103   @property
    104   def inline_scripts(self):
    105     if not self._inline_scripts:
    106       tags = self._soup.findAll('script', src=None)
    107       self._inline_scripts = [InlineScript(t.string) for t in tags]
    108     return self._inline_scripts
    109 
    110   @property
    111   def imports(self):
    112     tags = self._soup.findAll('link', rel='import')
    113     return [t['href'] for t in tags]
    114 
    115   @property
    116   def stylesheets(self):
    117     tags = self._soup.findAll('link', rel='stylesheet')
    118     return [t['href'] for t in tags]
    119 
    120   @property
    121   def inline_stylesheets(self):
    122     tags = self._soup.findAll('style')
    123     return [unicode(t.string) for t in tags]
    124 
    125   def YieldHTMLInPieces(self, controller, minify=False):
    126     yield self.GenerateHTML(controller, minify)
    127 
    128   def GenerateHTML(self, controller, minify=False, prettify=False):
    129     soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))
    130 
    131     # Remove declaration.
    132     for x in soup.contents:
    133       if isinstance(x, bs4.Doctype):
    134         x.extract()
    135 
    136     # Remove declaration.
    137     for x in soup.contents:
    138       if isinstance(x, bs4.Declaration):
    139         x.extract()
    140 
    141     # Remove all imports.
    142     imports = soup.findAll('link', rel='import')
    143     for imp in imports:
    144       imp.extract()
    145 
    146     # Remove all script links.
    147     scripts_external = soup.findAll('script', src=True)
    148     for script in scripts_external:
    149       script.extract()
    150 
    151     # Remove all in-line scripts.
    152     scripts_external = soup.findAll('script', src=None)
    153     for script in scripts_external:
    154       script.extract()
    155 
    156     # Process all in-line styles.
    157     inline_styles = soup.findAll('style')
    158     for style in inline_styles:
    159       html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
    160       if html:
    161         ns = soup.new_tag('style')
    162         ns.append(bs4.NavigableString(html))
    163         style.replaceWith(ns)
    164       else:
    165         style.extract()
    166 
    167     # Rewrite all external stylesheet hrefs or remove, as needed.
    168     stylesheet_links = soup.findAll('link', rel='stylesheet')
    169     for stylesheet_link in stylesheet_links:
    170       html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
    171       if html:
    172         tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
    173         assert len(tmp) == 1
    174         stylesheet_link.replaceWith(tmp[0])
    175       else:
    176         stylesheet_link.extract()
    177 
    178     # Remove comments if minifying.
    179     if minify:
    180       comments = soup.findAll(
    181           text=lambda text: isinstance(text, bs4.Comment))
    182       for comment in comments:
    183         comment.extract()
    184     if prettify:
    185       return soup.prettify('utf-8').strip()
    186 
    187     # We are done.
    188     return unicode(soup).strip()
    189 
    190   @property
    191   def html_contents_without_links_and_script(self):
    192     return self.GenerateHTML(
    193         html_generation_controller.HTMLGenerationController())
    194 
    195 
    196 class _Tag(object):
    197 
    198   def __init__(self, tag, attrs):
    199     self.tag = tag
    200     self.attrs = attrs
    201 
    202   def __repr__(self):
    203     attr_string = ' '.join('%s="%s"' % (x[0], x[1]) for x in self.attrs)
    204     return '<%s %s>' % (self.tag, attr_string)
    205 
    206 
    207 class HTMLModuleParser():
    208 
    209   def Parse(self, html):
    210     if html is None:
    211       html = ''
    212     else:
    213       if html.find('< /script>') != -1:
    214         raise Exception('Escape script tags with <\/script>')
    215 
    216     return HTMLModuleParserResults(html)
    217