Home | History | Annotate | Download | only in py_vulcanize
      1 # Copyright (c) 2013 The Chromium Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 import os
      6 import sys
      7 
      8 from py_vulcanize import js_utils
      9 from py_vulcanize import module
     10 from py_vulcanize import strip_js_comments
     11 from py_vulcanize import html_generation_controller
     12 
     13 
     14 def _AddToPathIfNeeded(path):
     15   if path not in sys.path:
     16     sys.path.insert(0, path)
     17 
     18 
     19 def _InitBeautifulSoup():
     20   catapult_path = os.path.abspath(
     21       os.path.join(os.path.dirname(__file__),
     22                    os.path.pardir, os.path.pardir, os.path.pardir))
     23   bs_path = os.path.join(catapult_path, 'third_party', 'beautifulsoup4')
     24   _AddToPathIfNeeded(bs_path)
     25 
     26   html5lib_path = os.path.join(catapult_path, 'third_party', 'html5lib-python')
     27   _AddToPathIfNeeded(html5lib_path)
     28 
     29   six_path = os.path.join(catapult_path, 'third_party', 'six')
     30   _AddToPathIfNeeded(six_path)
     31 
     32 
# The third_party directories must be on sys.path before bs4 is imported,
# which is why this import sits below the call instead of at the top of the
# file.
_InitBeautifulSoup()
import bs4
     35 
     36 class Script(object):
     37 
     38   def __init__(self, soup):
     39     if not soup:
     40       raise module.DepsException('Script object created without soup')
     41     self._soup = soup
     42 
     43   def AppendJSContentsToFile(self, f, *args, **kwargs):
     44     raise NotImplementedError()
     45 
     46 class InlineScript(Script):
     47 
     48   def __init__(self, soup):
     49     super(InlineScript, self).__init__(soup)
     50     self._stripped_contents = None
     51     self._open_tags = None
     52     self.is_external = False
     53 
     54   @property
     55   def contents(self):
     56     return unicode(self._soup.string)
     57 
     58   @property
     59   def stripped_contents(self):
     60     if not self._stripped_contents:
     61       self._stripped_contents = strip_js_comments.StripJSComments(
     62           self.contents)
     63     return self._stripped_contents
     64 
     65   @property
     66   def open_tags(self):
     67     if self._open_tags:
     68       return self._open_tags
     69     open_tags = []
     70     cur = self._soup.parent
     71     while cur:
     72       if isinstance(cur, bs4.BeautifulSoup):
     73         break
     74 
     75       open_tags.append(_Tag(cur.name, cur.attrs))
     76       cur = cur.parent
     77 
     78     open_tags.reverse()
     79     assert open_tags[-1].tag == 'script'
     80     del open_tags[-1]
     81 
     82     self._open_tags = open_tags
     83     return self._open_tags
     84 
     85   def AppendJSContentsToFile(self, f, *args, **kwargs):
     86     js = self.contents
     87     escaped_js = js_utils.EscapeJSIfNeeded(js)
     88     f.write(escaped_js)
     89     f.write('\n')
     90 
     91 class ExternalScript(Script):
     92 
     93   def __init__(self, soup):
     94     super(ExternalScript, self).__init__(soup)
     95     if 'src' not in soup.attrs:
     96       raise Exception("{0} is not an external script.".format(soup))
     97     self.is_external = True
     98     self._loaded_raw_script = None
     99 
    100   @property
    101   def loaded_raw_script(self):
    102     if self._loaded_raw_script:
    103       return self._loaded_raw_script
    104 
    105     return None
    106 
    107   @loaded_raw_script.setter
    108   def loaded_raw_script(self, value):
    109     self._loaded_raw_script = value
    110 
    111   @property
    112   def src(self):
    113     return self._soup.attrs['src']
    114 
    115   def AppendJSContentsToFile(self,
    116                              f,
    117                              use_include_tags_for_scripts,
    118                              dir_for_include_tag_root):
    119     raw_script = self.loaded_raw_script
    120     if not raw_script:
    121       return
    122 
    123     if use_include_tags_for_scripts:
    124       rel_filename = os.path.relpath(raw_script.filename,
    125                                     dir_for_include_tag_root)
    126       f.write("""<include src="%s">\n""" % rel_filename)
    127     else:
    128       f.write(js_utils.EscapeJSIfNeeded(raw_script.contents))
    129       f.write('\n')
    130 
    131 def _CreateSoupWithoutHeadOrBody(html):
    132   soupCopy = bs4.BeautifulSoup(html, 'html5lib')
    133   soup = bs4.BeautifulSoup()
    134   soup.reset()
    135   if soupCopy.head:
    136     for n in soupCopy.head.contents:
    137       n.extract()
    138       soup.append(n)
    139   if soupCopy.body:
    140     for n in soupCopy.body.contents:
    141       n.extract()
    142       soup.append(n)
    143   return soup
    144 
    145 
    146 class HTMLModuleParserResults(object):
    147 
    148   def __init__(self, html):
    149     self._soup = bs4.BeautifulSoup(html, 'html5lib')
    150     self._inline_scripts = None
    151     self._scripts = None
    152 
    153   @property
    154   def scripts_external(self):
    155     tags = self._soup.findAll('script', src=True)
    156     return [t['src'] for t in tags]
    157 
    158   @property
    159   def inline_scripts(self):
    160     if not self._inline_scripts:
    161       tags = self._soup.findAll('script', src=None)
    162       self._inline_scripts = [InlineScript(t.string) for t in tags]
    163     return self._inline_scripts
    164 
    165   @property
    166   def scripts(self):
    167     if not self._scripts:
    168       self._scripts = []
    169       script_elements = self._soup.findAll('script')
    170       for element in script_elements:
    171         if 'src' in element.attrs:
    172           self._scripts.append(ExternalScript(element))
    173         else:
    174           self._scripts.append(InlineScript(element))
    175     return self._scripts
    176 
    177   @property
    178   def imports(self):
    179     tags = self._soup.findAll('link', rel='import')
    180     return [t['href'] for t in tags]
    181 
    182   @property
    183   def stylesheets(self):
    184     tags = self._soup.findAll('link', rel='stylesheet')
    185     return [t['href'] for t in tags]
    186 
    187   @property
    188   def inline_stylesheets(self):
    189     tags = self._soup.findAll('style')
    190     return [unicode(t.string) for t in tags]
    191 
    192   def YieldHTMLInPieces(self, controller, minify=False):
    193     yield self.GenerateHTML(controller, minify)
    194 
    195   def GenerateHTML(self, controller, minify=False, prettify=False):
    196     soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))
    197 
    198     # Remove declaration.
    199     for x in soup.contents:
    200       if isinstance(x, bs4.Doctype):
    201         x.extract()
    202 
    203     # Remove declaration.
    204     for x in soup.contents:
    205       if isinstance(x, bs4.Declaration):
    206         x.extract()
    207 
    208     # Remove all imports.
    209     imports = soup.findAll('link', rel='import')
    210     for imp in imports:
    211       imp.extract()
    212 
    213     # Remove all script links.
    214     scripts_external = soup.findAll('script', src=True)
    215     for script in scripts_external:
    216       script.extract()
    217 
    218     # Remove all in-line scripts.
    219     scripts_external = soup.findAll('script', src=None)
    220     for script in scripts_external:
    221       script.extract()
    222 
    223     # Process all in-line styles.
    224     inline_styles = soup.findAll('style')
    225     for style in inline_styles:
    226       html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
    227       if html:
    228         ns = soup.new_tag('style')
    229         ns.append(bs4.NavigableString(html))
    230         style.replaceWith(ns)
    231       else:
    232         style.extract()
    233 
    234     # Rewrite all external stylesheet hrefs or remove, as needed.
    235     stylesheet_links = soup.findAll('link', rel='stylesheet')
    236     for stylesheet_link in stylesheet_links:
    237       html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
    238       if html:
    239         tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
    240         assert len(tmp) == 1
    241         stylesheet_link.replaceWith(tmp[0])
    242       else:
    243         stylesheet_link.extract()
    244 
    245     # Remove comments if minifying.
    246     if minify:
    247       comments = soup.findAll(
    248           text=lambda text: isinstance(text, bs4.Comment))
    249       for comment in comments:
    250         comment.extract()
    251     if prettify:
    252       return soup.prettify('utf-8').strip()
    253 
    254     # We are done.
    255     return unicode(soup).strip()
    256 
    257   @property
    258   def html_contents_without_links_and_script(self):
    259     return self.GenerateHTML(
    260         html_generation_controller.HTMLGenerationController())
    261 
    262 
    263 class _Tag(object):
    264 
    265   def __init__(self, tag, attrs):
    266     self.tag = tag
    267     self.attrs = attrs
    268 
    269   def __repr__(self):
    270     attr_string = ' '.join('%s="%s"' % (x[0], x[1]) for x in self.attrs)
    271     return '<%s %s>' % (self.tag, attr_string)
    272 
    273 
    274 class HTMLModuleParser():
    275 
    276   def Parse(self, html):
    277     if html is None:
    278       html = ''
    279     else:
    280       if html.find('< /script>') != -1:
    281         raise Exception('Escape script tags with <\/script>')
    282 
    283     return HTMLModuleParserResults(html)
    284