Home | History | Annotate | Download | only in tools
      1 #!/usr/bin/env python
      2 # Copyright (c) 2011 The Chromium Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 
      7 """Extracts registration forms from the corresponding HTML files.
      8 
      9 Used for extracting forms within HTML files. This script is used in
     10 conjunction with the webforms_aggregator.py script, which aggregates web pages
     11 with fillable forms (i.e., registration forms).
     12 
     13 The purpose of this script is to extract out all non-form elements that may be
     14 causing parsing errors and timeout issues when running browser_tests.
     15 
     16 This script extracts all forms from an HTML file.
     17 If a downloaded page contains multiple forms, a separate file is created
     18 for each form.
     19 
     20 Used as a standalone script but assumes that it is run from the directory in
     21 which it is checked into.
     22 
     23 Usage: forms_extractor.py [options]
     24 
     25 Options:
     26   -l LOG_LEVEL, --log_level=LOG_LEVEL,
     27     LOG_LEVEL: debug, info, warning or error [default: error]
     28   -j, --js  strips only the javascript and keeps all other elements.
     29   -h, --help  show this help message and exit
     30 """
     31 
     32 import glob
     33 import logging
     34 from optparse import OptionParser
     35 import os
     36 import re
     37 import sys
     38 
     39 
     40 class FormsExtractor(object):
     41   """Extracts HTML files, leaving only registration forms from the HTML file."""
     42   _HTML_FILES_PATTERN = r'*.html'
     43   _HTML_FILE_PREFIX = r'grabber-'
     44   _FORM_FILE_PREFIX = r'grabber-stripped-'
     45 
     46   _REGISTRATION_PAGES_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
     47                                          'heuristics', 'input')
     48   _EXTRACTED_FORMS_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
     49                                       'heuristics', 'input')
     50 
     51   logger = logging.getLogger(__name__)
     52   log_handlers = {'StreamHandler': None}
     53 
     54   # This pattern is used for retrieving the form location comment located at the
     55   # top of each downloaded HTML file indicating where the form originated from.
     56   _RE_FORM_LOCATION_PATTERN = re.compile(
     57       ur"""
     58       <!--Form\s{1}Location:  # Starting of form location comment.
     59       .*?                     # Any characters (non-greedy).
     60       -->                     # Ending of the form comment.
     61       """, re.U | re.S | re.I | re.X)
     62 
     63   # This pattern is used for removing all script code.
     64   _RE_SCRIPT_PATTERN = re.compile(
     65       ur"""
     66       <script       # A new opening '<script' tag.
     67       \b            # The end of the word 'script'.
     68       .*?           # Any characters (non-greedy).
     69       >             # Ending of the (opening) tag: '>'.
     70       .*?           # Any characters (non-greedy) between the tags.
     71       </script\s*>  # The '</script>' closing tag.
     72       """, re.U | re.S | re.I | re.X)
     73 
     74   # This pattern is used for removing all href js code.
     75   _RE_HREF_JS_PATTERN = re.compile(
     76       ur"""
     77       \bhref             # The word href and its beginning.
     78       \s*=\s*            # The '=' with all whitespace before and after it.
     79       (?P<quote>[\'\"])  # A single or double quote which is captured.
     80       \s*javascript\s*:  # The word 'javascript:' with any whitespace possible.
     81       .*?                # Any characters (non-greedy) between the quotes.
     82       \1                 # The previously captured single or double quote.
     83       """, re.U | re.S | re.I | re.X)
     84 
     85   _RE_EVENT_EXPR = (
     86       ur"""
     87       \b                 # The beginning of a new word.
     88       on\w+?             # All words starting with 'on' (non-greedy)
     89                          # example: |onmouseover|.
     90       \s*=\s*            # The '=' with all whitespace before and after it.
     91       (?P<quote>[\'\"])  # A captured single or double quote.
     92       .*?                # Any characters (non-greedy) between the quotes.
     93       \1                 # The previously captured single or double quote.
     94       """)
     95 
     96   # This pattern is used for removing code with js events, such as |onload|.
     97   # By adding the leading |ur'<[^<>]*?'| and the trailing |'ur'[^<>]*?>'| the
     98   # pattern matches to strings such as '<tr class="nav"
     99   # onmouseover="mOvr1(this);" onmouseout="mOut1(this);">'
    100   _RE_TAG_WITH_EVENTS_PATTERN = re.compile(
    101       ur"""
    102       <        # Matches character '<'.
    103       [^<>]*?  # Matches any characters except '<' and '>' (non-greedy).""" +
    104       _RE_EVENT_EXPR +
    105       ur"""
    106       [^<>]*?  # Matches any characters except '<' and '>' (non-greedy).
    107       >        # Matches character '>'.
    108       """, re.U | re.S | re.I | re.X)
    109 
    110   # Adds whitespace chars at the end of the matched event. Also match trailing
    111   # whitespaces for JS events. Do not match leading whitespace.
    112   # For example: |< /form>| is invalid HTML and does not exist but |</form >| is
    113   # considered valid HTML.
    114   _RE_EVENT_PATTERN = re.compile(
    115       _RE_EVENT_EXPR + ur'\s*', re.U | re.S | re.I | re.X)
    116 
    117   # This pattern is used for finding form elements.
    118   _RE_FORM_PATTERN = re.compile(
    119       ur"""
    120       <form       # A new opening '<form' tag.
    121       \b          # The end of the word 'form'.
    122       .*?         # Any characters (non-greedy).
    123       >           # Ending of the (opening) tag: '>'.
    124       .*?         # Any characters (non-greedy) between the tags.
    125       </form\s*>  # The '</form>' closing tag.
    126       """, re.U | re.S | re.I | re.X)
    127 
    128   def __init__(self, input_dir=_REGISTRATION_PAGES_DIR,
    129                output_dir=_EXTRACTED_FORMS_DIR, logging_level=None):
    130     """Creates a FormsExtractor object.
    131 
    132     Args:
    133       input_dir: the directory of HTML files.
    134       output_dir: the directory where the registration form files will be
    135                   saved.
    136       logging_level: verbosity level, default is None.
    137 
    138     Raises:
    139       IOError exception if input directory doesn't exist.
    140     """
    141     if logging_level:
    142       if not self.log_handlers['StreamHandler']:
    143         console = logging.StreamHandler()
    144         console.setLevel(logging.DEBUG)
    145         self.log_handlers['StreamHandler'] = console
    146         self.logger.addHandler(console)
    147       self.logger.setLevel(logging_level)
    148     else:
    149       if self.log_handlers['StreamHandler']:
    150         self.logger.removeHandler(self.log_handlers['StreamHandler'])
    151         self.log_handlers['StreamHandler'] = None
    152 
    153     self._input_dir = input_dir
    154     self._output_dir = output_dir
    155     if not os.path.isdir(self._input_dir):
    156       error_msg = 'Directory "%s" doesn\'t exist.' % self._input_dir
    157       self.logger.error('Error: %s', error_msg)
    158       raise IOError(error_msg)
    159     if not os.path.isdir(output_dir):
    160       os.makedirs(output_dir)
    161     self._form_location_comment = ''
    162 
    163   def _SubstituteAllEvents(self, matchobj):
    164     """Remove all js events that are present as attributes within a tag.
    165 
    166     Args:
    167       matchobj: A regexp |re.MatchObject| containing text that has at least one
    168                 event. Example: |<tr class="nav" onmouseover="mOvr1(this);"
    169                 onmouseout="mOut1(this);">|.
    170 
    171     Returns:
    172       The text containing the tag with all the attributes except for the tags
    173       with events. Example: |<tr class="nav">|.
    174     """
    175     tag_with_all_attrs = matchobj.group(0)
    176     return self._RE_EVENT_PATTERN.sub('', tag_with_all_attrs)
    177 
    178   def Extract(self, strip_js_only):
    179     """Extracts and saves the extracted registration forms.
    180 
    181     Iterates through all the HTML files.
    182 
    183     Args:
    184       strip_js_only: If True, only Javascript is stripped from the HTML content.
    185                      Otherwise, all non-form elements are stripped.
    186     """
    187     pathname_pattern = os.path.join(self._input_dir, self._HTML_FILES_PATTERN)
    188     html_files = [f for f in glob.glob(pathname_pattern) if os.path.isfile(f)]
    189     for filename in html_files:
    190       self.logger.info('Stripping file "%s" ...', filename)
    191       with open(filename, 'U') as f:
    192         html_content = self._RE_TAG_WITH_EVENTS_PATTERN.sub(
    193             self._SubstituteAllEvents,
    194             self._RE_HREF_JS_PATTERN.sub(
    195                 '', self._RE_SCRIPT_PATTERN.sub('', f.read())))
    196 
    197         form_filename = os.path.split(filename)[1]  # Path dropped.
    198         form_filename = form_filename.replace(self._HTML_FILE_PREFIX, '', 1)
    199         (form_filename, extension) = os.path.splitext(form_filename)
    200         form_filename = (self._FORM_FILE_PREFIX + form_filename +
    201                          '%s' + extension)
    202         form_filename = os.path.join(self._output_dir, form_filename)
    203         if strip_js_only:
    204           form_filename = form_filename % ''
    205           try:
    206             with open(form_filename, 'w') as f:
    207               f.write(html_content)
    208           except IOError as e:
    209             self.logger.error('Error: %s', e)
    210             continue
    211         else:  # Remove all non form elements.
    212           match = self._RE_FORM_LOCATION_PATTERN.search(html_content)
    213           if match:
    214             form_location_comment = match.group() + os.linesep
    215           else:
    216             form_location_comment = ''
    217           forms_iterator = self._RE_FORM_PATTERN.finditer(html_content)
    218           for form_number, form_match in enumerate(forms_iterator, start=1):
    219             form_content = form_match.group()
    220             numbered_form_filename = form_filename % form_number
    221             try:
    222               with open(numbered_form_filename, 'w') as f:
    223                 f.write(form_location_comment)
    224                 f.write(form_content)
    225             except IOError as e:
    226               self.logger.error('Error: %s', e)
    227               continue
    228           self.logger.info('\tFile "%s" extracted SUCCESSFULLY!', filename)
    229 
    230 
    231 def main():
    232   parser = OptionParser()
    233   parser.add_option(
    234       '-l', '--log_level', metavar='LOG_LEVEL', default='error',
    235       help='LOG_LEVEL: debug, info, warning or error [default: %default]')
    236   parser.add_option(
    237       '-j', '--js', dest='js', action='store_true', default=False,
    238       help='Removes all javascript elements [default: %default]')
    239 
    240   (options, args) = parser.parse_args()
    241   options.log_level = options.log_level.upper()
    242   if options.log_level not in ['DEBUG', 'INFO', 'WARNING', 'ERROR']:
    243     print 'Wrong log_level argument.'
    244     parser.print_help()
    245     return 1
    246 
    247   options.log_level = getattr(logging, options.log_level)
    248   extractor = FormsExtractor(logging_level=options.log_level)
    249   extractor.Extract(options.js)
    250   return 0
    251 
    252 
# Only run when executed as a script (not on import); the value returned by
# main() is used as the process exit status.
if __name__ == '__main__':
  sys.exit(main())
    255