Home | History | Annotate | Download | only in format
      1 #!/usr/bin/env python
      2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 """Flattens a HTML file by inlining its external resources.
      7 
      8 This is a small script that takes a HTML file, looks for src attributes
      9 and inlines the specified file, producing one HTML file with no external
     10 dependencies. It recursively inlines the included files.
     11 """
     12 
     13 import os
     14 import re
     15 import sys
     16 import base64
     17 import mimetypes
     18 
     19 from grit import lazy_re
     20 from grit import util
     21 
     22 # There is a python bug that makes mimetypes crash if the Windows
     23 # registry contains non-Latin keys ( http://bugs.python.org/issue9291
     24 # ). Initing manually and blocking external mime-type databases will
     25 # prevent that bug and if we add svg manually, it will still give us
     26 # the data we need.
     27 mimetypes.init([])
     28 mimetypes.add_type('image/svg+xml', '.svg')
     29 
     30 DIST_DEFAULT = 'chromium'
     31 DIST_ENV_VAR = 'CHROMIUM_BUILD'
     32 DIST_SUBSTR = '%DISTRIBUTION%'
     33 
     34 # Matches beginning of an "if" block with trailing spaces.
     35 _BEGIN_IF_BLOCK = lazy_re.compile(
     36     '<if [^>]*?expr="(?P<expression>[^"]*)"[^>]*?>\s*')
     37 
     38 # Matches ending of an "if" block with preceding spaces.
     39 _END_IF_BLOCK = lazy_re.compile('\s*</if>')
     40 
     41 # Used by DoInline to replace various links with inline content.
     42 _STYLESHEET_RE = lazy_re.compile(
     43     '<link rel="stylesheet"[^>]+?href="(?P<filename>[^"]*)".*?>(\s*</link>)?',
     44     re.DOTALL)
     45 _INCLUDE_RE = lazy_re.compile(
     46     '<include[^>]+?src="(?P<filename>[^"\']*)".*?>(\s*</include>)?',
     47     re.DOTALL)
     48 _SRC_RE = lazy_re.compile(
     49     r'<(?!script)(?:[^>]+?\s)src=(?P<quote>")(?P<filename>[^"\']*)\1',
     50     re.MULTILINE)
     51 _ICON_RE = lazy_re.compile(
     52     r'<link rel="icon"\s(?:[^>]+?\s)?'
     53     'href=(?P<quote>")(?P<filename>[^"\']*)\1',
     54     re.MULTILINE)
     55 
     56 
     57 def GetDistribution():
     58   """Helper function that gets the distribution we are building.
     59 
     60   Returns:
     61     string
     62   """
     63   distribution = DIST_DEFAULT
     64   if DIST_ENV_VAR in os.environ.keys():
     65     distribution = os.environ[DIST_ENV_VAR]
     66     if len(distribution) > 1 and distribution[0] == '_':
     67       distribution = distribution[1:].lower()
     68   return distribution
     69 
     70 
     71 def SrcInlineAsDataURL(
     72     src_match, base_path, distribution, inlined_files, names_only=False,
     73     filename_expansion_function=None):
     74   """regex replace function.
     75 
     76   Takes a regex match for src="filename", attempts to read the file
     77   at 'filename' and returns the src attribute with the file inlined
     78   as a data URI. If it finds DIST_SUBSTR string in file name, replaces
     79   it with distribution.
     80 
     81   Args:
     82     src_match: regex match object with 'filename' and 'quote' named capturing
     83                groups
     84     base_path: path that to look for files in
     85     distribution: string that should replace DIST_SUBSTR
     86     inlined_files: The name of the opened file is appended to this list.
     87     names_only: If true, the function will not read the file but just return "".
     88                 It will still add the filename to |inlined_files|.
     89 
     90   Returns:
     91     string
     92   """
     93   filename = src_match.group('filename')
     94   if filename_expansion_function:
     95     filename = filename_expansion_function(filename)
     96   quote = src_match.group('quote')
     97 
     98   if filename.find(':') != -1:
     99     # filename is probably a URL, which we don't want to bother inlining
    100     return src_match.group(0)
    101 
    102   filename = filename.replace(DIST_SUBSTR , distribution)
    103   filepath = os.path.normpath(os.path.join(base_path, filename))
    104   inlined_files.add(filepath)
    105 
    106   if names_only:
    107     return ""
    108 
    109   mimetype = mimetypes.guess_type(filename)[0]
    110   if mimetype is None:
    111     raise Exception('%s is of an an unknown type and '
    112                     'cannot be stored in a data url.' % filename)
    113   inline_data = base64.standard_b64encode(util.ReadFile(filepath, util.BINARY))
    114 
    115   prefix = src_match.string[src_match.start():src_match.start('filename')]
    116   suffix = src_match.string[src_match.end('filename'):src_match.end()]
    117   return '%sdata:%s;base64,%s%s' % (prefix, mimetype, inline_data, suffix)
    118 
    119 
    120 class InlinedData:
    121   """Helper class holding the results from DoInline().
    122 
    123   Holds the inlined data and the set of filenames of all the inlined
    124   files.
    125   """
    126   def __init__(self, inlined_data, inlined_files):
    127     self.inlined_data = inlined_data
    128     self.inlined_files = inlined_files
    129 
    130 def DoInline(
    131     input_filename, grd_node, allow_external_script=False, names_only=False,
    132     rewrite_function=None, filename_expansion_function=None):
    133   """Helper function that inlines the resources in a specified file.
    134 
    135   Reads input_filename, finds all the src attributes and attempts to
    136   inline the files they are referring to, then returns the result and
    137   the set of inlined files.
    138 
    139   Args:
    140     input_filename: name of file to read in
    141     grd_node: html node from the grd file for this include tag
    142     names_only: |nil| will be returned for the inlined contents (faster).
    143     rewrite_function: function(filepath, text, distribution) which will be
    144         called to rewrite html content before inlining images.
    145     filename_expansion_function: function(filename) which will be called to
    146         rewrite filenames before attempting to read them.
    147   Returns:
    148     a tuple of the inlined data as a string and the set of filenames
    149     of all the inlined files
    150   """
    151   if filename_expansion_function:
    152     input_filename = filename_expansion_function(input_filename)
    153   input_filepath = os.path.dirname(input_filename)
    154   distribution = GetDistribution()
    155 
    156   # Keep track of all the files we inline.
    157   inlined_files = set()
    158 
    159   def SrcReplace(src_match, filepath=input_filepath,
    160                  inlined_files=inlined_files):
    161     """Helper function to provide SrcInlineAsDataURL with the base file path"""
    162     return SrcInlineAsDataURL(
    163         src_match, filepath, distribution, inlined_files, names_only=names_only,
    164         filename_expansion_function=filename_expansion_function)
    165 
    166   def GetFilepath(src_match, base_path = input_filepath):
    167     filename = src_match.group('filename')
    168 
    169     if filename.find(':') != -1:
    170       # filename is probably a URL, which we don't want to bother inlining
    171       return None
    172 
    173     filename = filename.replace('%DISTRIBUTION%', distribution)
    174     if filename_expansion_function:
    175       filename = filename_expansion_function(filename)
    176     return os.path.normpath(os.path.join(base_path, filename))
    177 
    178   def IsConditionSatisfied(src_match):
    179     expression = src_match.group('expression')
    180     return grd_node is None or grd_node.EvaluateCondition(expression)
    181 
    182   def CheckConditionalElements(str):
    183     """Helper function to conditionally inline inner elements"""
    184     while True:
    185       begin_if = _BEGIN_IF_BLOCK.search(str)
    186       if begin_if is None:
    187         return str
    188 
    189       condition_satisfied = IsConditionSatisfied(begin_if)
    190       leading = str[0:begin_if.start()]
    191       content_start = begin_if.end()
    192 
    193       # Find matching "if" block end.
    194       count = 1
    195       pos = begin_if.end()
    196       while True:
    197         end_if = _END_IF_BLOCK.search(str, pos)
    198         if end_if is None:
    199           raise Exception('Unmatched <if>')
    200 
    201         next_if = _BEGIN_IF_BLOCK.search(str, pos)
    202         if next_if is None or next_if.start() >= end_if.end():
    203           count = count - 1
    204           if count == 0:
    205             break
    206           pos = end_if.end()
    207         else:
    208           count = count + 1
    209           pos = next_if.end()
    210 
    211       content = str[content_start:end_if.start()]
    212       trailing = str[end_if.end():]
    213 
    214       if condition_satisfied:
    215         str = leading + CheckConditionalElements(content) + trailing
    216       else:
    217         str = leading + trailing
    218 
    219   def InlineFileContents(src_match, pattern, inlined_files=inlined_files):
    220     """Helper function to inline external files of various types"""
    221     filepath = GetFilepath(src_match)
    222     if filepath is None:
    223       return src_match.group(0)
    224     inlined_files.add(filepath)
    225 
    226     if names_only:
    227       inlined_files.update(GetResourceFilenames(
    228           filepath,
    229           allow_external_script,
    230           rewrite_function,
    231           filename_expansion_function=filename_expansion_function))
    232       return ""
    233 
    234     return pattern % InlineToString(
    235         filepath, grd_node, allow_external_script,
    236         filename_expansion_function=filename_expansion_function)
    237 
    238   def InlineIncludeFiles(src_match):
    239     """Helper function to directly inline generic external files (without
    240        wrapping them with any kind of tags).
    241     """
    242     return InlineFileContents(src_match, '%s')
    243 
    244   def InlineScript(match):
    245     """Helper function to inline external script files"""
    246     attrs = (match.group('attrs1') + match.group('attrs2')).strip()
    247     if attrs:
    248        attrs = ' ' + attrs
    249     return InlineFileContents(match, '<script' + attrs + '>%s</script>')
    250 
    251   def InlineCSSText(text, css_filepath):
    252     """Helper function that inlines external resources in CSS text"""
    253     filepath = os.path.dirname(css_filepath)
    254     # Allow custom modifications before inlining images.
    255     if rewrite_function:
    256       text = rewrite_function(filepath, text, distribution)
    257     text = InlineCSSImages(text, filepath)
    258     return InlineCSSImports(text, filepath)
    259 
    260   def InlineCSSFile(src_match, pattern, base_path=input_filepath):
    261     """Helper function to inline external CSS files.
    262 
    263     Args:
    264       src_match: A regular expression match with a named group named "filename".
    265       pattern: The pattern to replace with the contents of the CSS file.
    266       base_path: The base path to use for resolving the CSS file.
    267 
    268     Returns:
    269       The text that should replace the reference to the CSS file.
    270     """
    271     filepath = GetFilepath(src_match, base_path)
    272     if filepath is None:
    273       return src_match.group(0)
    274 
    275     # Even if names_only is set, the CSS file needs to be opened, because it
    276     # can link to images that need to be added to the file set.
    277     inlined_files.add(filepath)
    278     # When resolving CSS files we need to pass in the path so that relative URLs
    279     # can be resolved.
    280     return pattern % InlineCSSText(util.ReadFile(filepath, util.BINARY),
    281                                    filepath)
    282 
    283   def InlineCSSImages(text, filepath=input_filepath):
    284     """Helper function that inlines external images in CSS backgrounds."""
    285     # Replace contents of url() for css attributes: content, background,
    286     # or *-image.
    287     return re.sub('(content|background|[\w-]*-image):[^;]*' +
    288                   '(url\((?P<quote1>"|\'|)[^"\'()]*(?P=quote1)\)|' +
    289                       'image-set\(' +
    290                           '([ ]*url\((?P<quote2>"|\'|)[^"\'()]*(?P=quote2)\)' +
    291                               '[ ]*[0-9.]*x[ ]*(,[ ]*)?)+\))',
    292                   lambda m: InlineCSSUrls(m, filepath),
    293                   text)
    294 
    295   def InlineCSSUrls(src_match, filepath=input_filepath):
    296     """Helper function that inlines each url on a CSS image rule match."""
    297     # Replace contents of url() references in matches.
    298     return re.sub('url\((?P<quote>"|\'|)(?P<filename>[^"\'()]*)(?P=quote)\)',
    299                   lambda m: SrcReplace(m, filepath),
    300                   src_match.group(0))
    301 
    302   def InlineCSSImports(text, filepath=input_filepath):
    303     """Helper function that inlines CSS files included via the @import
    304        directive.
    305     """
    306     return re.sub('@import\s+url\((?P<quote>"|\'|)(?P<filename>[^"\'()]*)' +
    307                   '(?P=quote)\);',
    308                   lambda m: InlineCSSFile(m, '%s', filepath),
    309                   text)
    310 
    311 
    312   flat_text = util.ReadFile(input_filename, util.BINARY)
    313 
    314   # Check conditional elements, remove unsatisfied ones from the file. We do
    315   # this twice. The first pass is so that we don't even bother calling
    316   # InlineScript, InlineCSSFile and InlineIncludeFiles on text we're eventually
    317   # going to throw out anyway.
    318   flat_text = CheckConditionalElements(flat_text)
    319 
    320   if not allow_external_script:
    321     # We need to inline css and js before we inline images so that image
    322     # references gets inlined in the css and js
    323     flat_text = re.sub('<script (?P<attrs1>.*?)src="(?P<filename>[^"\']*)"' +
    324                        '(?P<attrs2>.*?)></script>',
    325                        InlineScript,
    326                        flat_text)
    327 
    328   flat_text = _STYLESHEET_RE.sub(
    329       lambda m: InlineCSSFile(m, '<style>%s</style>'),
    330       flat_text)
    331 
    332   flat_text = _INCLUDE_RE.sub(InlineIncludeFiles, flat_text)
    333 
    334   # Check conditional elements, second pass. This catches conditionals in any
    335   # of the text we just inlined.
    336   flat_text = CheckConditionalElements(flat_text)
    337 
    338   # Allow custom modifications before inlining images.
    339   if rewrite_function:
    340     flat_text = rewrite_function(input_filepath, flat_text, distribution)
    341 
    342   flat_text = _SRC_RE.sub(SrcReplace, flat_text)
    343 
    344   # TODO(arv): Only do this inside <style> tags.
    345   flat_text = InlineCSSImages(flat_text)
    346 
    347   flat_text = _ICON_RE.sub(SrcReplace, flat_text)
    348 
    349   if names_only:
    350     flat_text = None  # Will contains garbage if the flag is set anyway.
    351   return InlinedData(flat_text, inlined_files)
    352 
    353 
    354 def InlineToString(input_filename, grd_node, allow_external_script=False,
    355                    rewrite_function=None, filename_expansion_function=None):
    356   """Inlines the resources in a specified file and returns it as a string.
    357 
    358   Args:
    359     input_filename: name of file to read in
    360     grd_node: html node from the grd file for this include tag
    361   Returns:
    362     the inlined data as a string
    363   """
    364   try:
    365     return DoInline(
    366         input_filename,
    367         grd_node,
    368         allow_external_script=allow_external_script,
    369         rewrite_function=rewrite_function,
    370         filename_expansion_function=filename_expansion_function).inlined_data
    371   except IOError, e:
    372     raise Exception("Failed to open %s while trying to flatten %s. (%s)" %
    373                     (e.filename, input_filename, e.strerror))
    374 
    375 
    376 def InlineToFile(input_filename, output_filename, grd_node):
    377   """Inlines the resources in a specified file and writes it.
    378 
    379   Reads input_filename, finds all the src attributes and attempts to
    380   inline the files they are referring to, then writes the result
    381   to output_filename.
    382 
    383   Args:
    384     input_filename: name of file to read in
    385     output_filename: name of file to be written to
    386     grd_node: html node from the grd file for this include tag
    387   Returns:
    388     a set of filenames of all the inlined files
    389   """
    390   inlined_data = InlineToString(input_filename, grd_node)
    391   with open(output_filename, 'wb') as out_file:
    392     out_file.writelines(inlined_data)
    393 
    394 
    395 def GetResourceFilenames(filename,
    396                          allow_external_script=False,
    397                          rewrite_function=None,
    398                          filename_expansion_function=None):
    399   """For a grd file, returns a set of all the files that would be inline."""
    400   try:
    401     return DoInline(
    402         filename,
    403         None,
    404         names_only=True,
    405         allow_external_script=allow_external_script,
    406         rewrite_function=rewrite_function,
    407         filename_expansion_function=filename_expansion_function).inlined_files
    408   except IOError, e:
    409     raise Exception("Failed to open %s while trying to flatten %s. (%s)" %
    410                     (e.filename, filename, e.strerror))
    411 
    412 
    413 def main():
    414   if len(sys.argv) <= 2:
    415     print "Flattens a HTML file by inlining its external resources.\n"
    416     print "html_inline.py inputfile outputfile"
    417   else:
    418     InlineToFile(sys.argv[1], sys.argv[2], None)
    419 
    420 if __name__ == '__main__':
    421   main()
    422