py_vulcanize/py_vulcanize/parse_html_deps.py

# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import os
import sys

from py_vulcanize import module
from py_vulcanize import strip_js_comments
from py_vulcanize import html_generation_controller


def _AddToPathIfNeeded(path):
  if path not in sys.path:
    sys.path.insert(0, path)


def _InitBeautifulSoup():
  catapult_path = os.path.abspath(
      os.path.join(os.path.dirname(__file__),
                   os.path.pardir, os.path.pardir, os.path.pardir))
  bs_path = os.path.join(catapult_path, 'third_party', 'beautifulsoup4')
  _AddToPathIfNeeded(bs_path)

  html5lib_path = os.path.join(catapult_path, 'third_party', 'html5lib-python')
  _AddToPathIfNeeded(html5lib_path)

  six_path = os.path.join(catapult_path, 'third_party', 'six')
  _AddToPathIfNeeded(six_path)


_InitBeautifulSoup()
import bs4


class InlineScript(object):

  def __init__(self, soup):
    if not soup:
      raise module.DepsException('InlineScript created without soup')
    self._soup = soup
    self._stripped_contents = None
    self._open_tags = None

  @property
  def contents(self):
    return unicode(self._soup.string)

  @property
  def stripped_contents(self):
    if not self._stripped_contents:
      self._stripped_contents = strip_js_comments.StripJSComments(
          self.contents)
    return self._stripped_contents

  @property
  def open_tags(self):
    if self._open_tags:
      return self._open_tags
    open_tags = []
    cur = self._soup.parent
    while cur:
      if isinstance(cur, bs4.BeautifulSoup):
        break

      open_tags.append(_Tag(cur.name, cur.attrs))
      cur = cur.parent

    open_tags.reverse()
    assert open_tags[-1].tag == 'script'
    del open_tags[-1]

    self._open_tags = open_tags
    return self._open_tags


def _CreateSoupWithoutHeadOrBody(html):
  soupCopy = bs4.BeautifulSoup(html, 'html5lib')
  soup = bs4.BeautifulSoup()
  soup.reset()
  if soupCopy.head:
    for n in soupCopy.head.contents:
      n.extract()
      soup.append(n)
  if soupCopy.body:
    for n in soupCopy.body.contents:
      n.extract()
      soup.append(n)
  return soup


class HTMLModuleParserResults(object):

  def __init__(self, html):
    self._soup = bs4.BeautifulSoup(html, 'html5lib')
    self._inline_scripts = None

  @property
  def scripts_external(self):
    tags = self._soup.findAll('script', src=True)
    return [t['src'] for t in tags]

  @property
  def inline_scripts(self):
    if not self._inline_scripts:
      tags = self._soup.findAll('script', src=None)
      self._inline_scripts = [InlineScript(t.string) for t in tags]
    return self._inline_scripts

  @property
  def imports(self):
    tags = self._soup.findAll('link', rel='import')
    return [t['href'] for t in tags]

  @property
  def stylesheets(self):
    tags = self._soup.findAll('link', rel='stylesheet')
    return [t['href'] for t in tags]

  @property
  def inline_stylesheets(self):
    tags = self._soup.findAll('style')
    return [unicode(t.string) for t in tags]

  def YieldHTMLInPieces(self, controller, minify=False):
    yield self.GenerateHTML(controller, minify)

  def GenerateHTML(self, controller, minify=False, prettify=False):
    soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

    # Remove declaration.
    for x in soup.contents:
      if isinstance(x, bs4.Doctype):
        x.extract()

    # Remove declaration.
    for x in soup.contents:
      if isinstance(x, bs4.Declaration):
        x.extract()

    # Remove all imports.
    imports = soup.findAll('link', rel='import')
    for imp in imports:
      imp.extract()

    # Remove all script links.
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
      script.extract()

    # Remove all in-line scripts.
    scripts_external = soup.findAll('script', src=None)
    for script in scripts_external:
      script.extract()

    # Process all in-line styles.
    inline_styles = soup.findAll('style')
    for style in inline_styles:
      html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
      if html:
        ns = soup.new_tag('style')
        ns.append(bs4.NavigableString(html))
        style.replaceWith(ns)
      else:
        style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
      if html:
        tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
        assert len(tmp) == 1
        stylesheet_link.replaceWith(tmp[0])
      else:
        stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
      comments = soup.findAll(
          text=lambda text: isinstance(text, bs4.Comment))
      for comment in comments:
        comment.extract()
    if prettify:
      return soup.prettify('utf-8').strip()

    # We are done.
    return unicode(soup).strip()

  @property
  def html_contents_without_links_and_script(self):
    return self.GenerateHTML(
        html_generation_controller.HTMLGenerationController())


class _Tag(object):

  def __init__(self, tag, attrs):
    self.tag = tag
    self.attrs = attrs

  def __repr__(self):
    attr_string = ' '.join('%s="%s"' % (x[0], x[1]) for x in self.attrs)
    return '<%s %s>' % (self.tag, attr_string)


class HTMLModuleParser():

  def Parse(self, html):
    if html is None:
      html = ''
    else:
      if html.find('< /script>') != -1:
        raise Exception('Escape script tags with <\/script>')

    return HTMLModuleParserResults(html)