#!/usr/bin/env python
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.


"""Extracts registration forms from the corresponding HTML files.

Used for extracting forms within HTML files. This script is used in
conjunction with the webforms_aggregator.py script, which aggregates web pages
with fillable forms (i.e. registration forms).

The purpose of this script is to strip out all non-form elements that may be
causing parsing errors and timeout issues when running browser_tests.

This script extracts all forms from an HTML file.
If there are multiple forms per downloaded site, one file is created for each
form.

Used as a standalone script but assumes that it is run from the directory in
which it is checked into.

The code is valid under both Python 2.7 and Python 3.

Usage: forms_extractor.py [options]

Options:
  -l LOG_LEVEL, --log_level=LOG_LEVEL,
    LOG_LEVEL: debug, info, warning or error [default: error]
  -j, --js  only strips javascript from each page and saves the whole page,
            instead of extracting the individual forms.
  -h, --help  show this help message and exit
"""

import glob
import io
import logging
from optparse import OptionParser
import os
import re
import sys


class FormsExtractor(object):
  """Extracts HTML files, leaving only registration forms from the HTML file."""
  _HTML_FILES_PATTERN = r'*.html'
  _HTML_FILE_PREFIX = r'grabber-'
  _FORM_FILE_PREFIX = r'grabber-stripped-'

  # NOTE(review): the input and output directories default to the same path, so
  # files written by a previous run also match |_HTML_FILES_PATTERN| on the
  # next run. Confirm whether already-stripped files should be skipped.
  _REGISTRATION_PAGES_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
                                         'heuristics', 'input')
  _EXTRACTED_FORMS_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
                                      'heuristics', 'input')

  # latin-1 maps every byte to exactly one code point and back, so reading and
  # writing with it passes the page bytes through unchanged whatever encoding
  # the downloaded page really uses. All patterns below only look for ASCII
  # markup, so they are unaffected by this choice.
  _FILE_ENCODING = 'latin-1'

  logger = logging.getLogger(__name__)
  # Shared by all instances so that the console handler is attached to the
  # (also shared) logger at most once.
  log_handlers = {'StreamHandler': None}

  # This pattern is used for retrieving the form location comment located at the
  # top of each downloaded HTML file indicating where the form originated from.
  _RE_FORM_LOCATION_PATTERN = re.compile(
      r"""
      <!--Form\s{1}Location:  # Starting of form location comment.
      .*?                     # Any characters (non-greedy).
      -->                     # Ending of the form comment.
      """, re.U | re.S | re.I | re.X)

  # This pattern is used for removing all script code.
  _RE_SCRIPT_PATTERN = re.compile(
      r"""
      <script       # A new opening '<script' tag.
      \b            # The end of the word 'script'.
      .*?           # Any characters (non-greedy).
      >             # Ending of the (opening) tag: '>'.
      .*?           # Any characters (non-greedy) between the tags.
      </script\s*>  # The '</script>' closing tag.
      """, re.U | re.S | re.I | re.X)

  # This pattern is used for removing all href js code.
  _RE_HREF_JS_PATTERN = re.compile(
      r"""
      \bhref             # The word href and its beginning.
      \s*=\s*            # The '=' with all whitespace before and after it.
      (?P<quote>[\'\"])  # A single or double quote which is captured.
      \s*javascript\s*:  # The word 'javascript:' with any whitespace possible.
      .*?                # Any characters (non-greedy) between the quotes.
      \1                 # The previously captured single or double quote.
      """, re.U | re.S | re.I | re.X)

  # Must start and end with a newline: it is concatenated with other verbose
  # pattern fragments, and the newlines terminate the neighbouring '#' comments.
  _RE_EVENT_EXPR = (
      r"""
      \b                 # The beginning of a new word.
      on\w+?             # All words starting with 'on' (non-greedy)
                         # example: |onmouseover|.
      \s*=\s*            # The '=' with all whitespace before and after it.
      (?P<quote>[\'\"])  # A captured single or double quote.
      .*?                # Any characters (non-greedy) between the quotes.
      \1                 # The previously captured single or double quote.
      """)

  # This pattern is used for removing code with js events, such as |onload|.
  # By adding the leading |<[^<>]*?| and the trailing |[^<>]*?>| the pattern
  # matches whole tags such as '<tr class="nav"
  # onmouseover="mOvr1(this);" onmouseout="mOut1(this);">'
  _RE_TAG_WITH_EVENTS_PATTERN = re.compile(
      r"""
      <        # Matches character '<'.
      [^<>]*?  # Matches any characters except '<' and '>' (non-greedy).""" +
      _RE_EVENT_EXPR +
      r"""
      [^<>]*?  # Matches any characters except '<' and '>' (non-greedy).
      >        # Matches character '>'.
      """, re.U | re.S | re.I | re.X)

  # Matches a single js event attribute together with the whitespace that
  # follows it. Leading whitespace is deliberately not matched, so removing the
  # last attribute of a tag leaves e.g. |<form >|: a space before '>' is valid
  # HTML, whereas |< /form>| would not be.
  _RE_EVENT_PATTERN = re.compile(
      _RE_EVENT_EXPR + r'\s*', re.U | re.S | re.I | re.X)

  # This pattern is used for finding form elements.
  _RE_FORM_PATTERN = re.compile(
      r"""
      <form       # A new opening '<form' tag.
      \b          # The end of the word 'form'.
      .*?         # Any characters (non-greedy).
      >           # Ending of the (opening) tag: '>'.
      .*?         # Any characters (non-greedy) between the tags.
      </form\s*>  # The '</form>' closing tag.
      """, re.U | re.S | re.I | re.X)

  def __init__(self, input_dir=_REGISTRATION_PAGES_DIR,
               output_dir=_EXTRACTED_FORMS_DIR, logging_level=None):
    """Creates a FormsExtractor object.

    The output directory is created if it does not exist yet.

    Args:
      input_dir: the directory of HTML files.
      output_dir: the directory where the registration form files will be
                  saved.
      logging_level: verbosity level, default is None. A truthy level attaches
                     a console handler to the class logger (once) and sets the
                     logger to that level; a falsy one detaches the handler.

    Raises:
      IOError exception if input directory doesn't exist.
    """
    if logging_level:
      if not self.log_handlers['StreamHandler']:
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        self.log_handlers['StreamHandler'] = console
        self.logger.addHandler(console)
      self.logger.setLevel(logging_level)
    else:
      if self.log_handlers['StreamHandler']:
        self.logger.removeHandler(self.log_handlers['StreamHandler'])
        self.log_handlers['StreamHandler'] = None

    self._input_dir = input_dir
    self._output_dir = output_dir
    if not os.path.isdir(self._input_dir):
      error_msg = 'Directory "%s" doesn\'t exist.' % self._input_dir
      self.logger.error('Error: %s', error_msg)
      raise IOError(error_msg)
    if not os.path.isdir(self._output_dir):
      os.makedirs(self._output_dir)
    self._form_location_comment = ''

  def _SubstituteAllEvents(self, matchobj):
    """Remove all js events that are present as attributes within a tag.

    Args:
      matchobj: A regexp |re.MatchObject| containing text that has at least one
                event. Example: |<tr class="nav" onmouseover="mOvr1(this);"
                onmouseout="mOut1(this);">|.

    Returns:
      The text of the tag with all its attributes except for the event
      attributes. Whitespace in front of a removed attribute is kept.
      Example: |<tr class="nav" >|.
    """
    tag_with_all_attrs = matchobj.group(0)
    return self._RE_EVENT_PATTERN.sub('', tag_with_all_attrs)

  def _StripJavascript(self, html_content):
    """Returns |html_content| without scripts, js hrefs and js event attributes.

    Args:
      html_content: the text of an HTML page.
    """
    without_scripts = self._RE_SCRIPT_PATTERN.sub('', html_content)
    without_js_hrefs = self._RE_HREF_JS_PATTERN.sub('', without_scripts)
    return self._RE_TAG_WITH_EVENTS_PATTERN.sub(
        self._SubstituteAllEvents, without_js_hrefs)

  def _OutputPath(self, filename, suffix=''):
    """Returns the output file path that corresponds to an input file.

    The name is assembled by concatenation rather than '%' formatting, so a
    '%' in the file name or in the output directory is harmless.

    Args:
      filename: path of the input HTML file.
      suffix: value inserted between the file name stem and its extension,
              e.g. the number of the form.
    """
    basename = os.path.basename(filename)
    # Only a leading prefix is dropped, never an occurrence inside the name.
    if basename.startswith(self._HTML_FILE_PREFIX):
      basename = basename[len(self._HTML_FILE_PREFIX):]
    stem, extension = os.path.splitext(basename)
    return os.path.join(
        self._output_dir,
        self._FORM_FILE_PREFIX + stem + str(suffix) + extension)

  def _WriteFile(self, path, chunks):
    """Writes the text |chunks| to |path|, logging any IOError.

    Args:
      path: path of the file to (over)write.
      chunks: iterable of text strings written one after the other.

    Returns:
      True if the file was written, False if an IOError occurred.
    """
    try:
      with io.open(path, 'w', encoding=self._FILE_ENCODING) as f:
        for chunk in chunks:
          f.write(chunk)
    except IOError as e:
      self.logger.error('Error: %s', e)
      return False
    return True

  def _ExtractForms(self, filename, html_content):
    """Saves every form found in |html_content| to its own numbered file.

    Each file starts with the page's form location comment, when present.

    Args:
      filename: path of the input HTML file, used to name the output files.
      html_content: the text of the page, already stripped of javascript.

    Returns:
      True if all the form files were written, False otherwise.
    """
    match = self._RE_FORM_LOCATION_PATTERN.search(html_content)
    # The files are written in text mode, which converts '\n' to the platform
    # line ending; writing os.linesep would yield '\r\r\n' on Windows.
    form_location_comment = match.group() + u'\n' if match else u''
    all_written = True
    forms_iterator = self._RE_FORM_PATTERN.finditer(html_content)
    for form_number, form_match in enumerate(forms_iterator, start=1):
      form_path = self._OutputPath(filename, form_number)
      # A failed write does not stop the extraction of the remaining forms.
      if not self._WriteFile(form_path,
                             [form_location_comment, form_match.group()]):
        all_written = False
    return all_written

  def Extract(self, strip_js_only):
    """Extracts and saves the extracted registration forms.

    Iterates through all the HTML files of the input directory. Javascript is
    always stripped from the content. Write errors are logged and do not stop
    the processing of the remaining files; success is only logged for a file
    whose output was written completely.

    Args:
      strip_js_only: If True, only Javascript is stripped from the HTML content
                     and the whole page is saved to a single file. Otherwise,
                     all non-form elements are stripped and each form is saved
                     to its own numbered file.
    """
    pathname_pattern = os.path.join(self._input_dir, self._HTML_FILES_PATTERN)
    html_files = [f for f in glob.glob(pathname_pattern) if os.path.isfile(f)]
    for filename in html_files:
      self.logger.info('Stripping file "%s" ...', filename)
      # Text mode gives universal newlines on Python 2 and 3 alike; the 'U'
      # open mode no longer exists in Python 3.11+.
      with io.open(filename, 'r', encoding=self._FILE_ENCODING) as f:
        html_content = self._StripJavascript(f.read())

      if strip_js_only:
        succeeded = self._WriteFile(self._OutputPath(filename), [html_content])
      else:  # Remove all non form elements.
        succeeded = self._ExtractForms(filename, html_content)
      if succeeded:
        self.logger.info('\tFile "%s" extracted SUCCESSFULLY!', filename)


def main():
  """Parses the command line and runs the extraction.

  Returns:
    0 on success, 1 if the log level argument is invalid.
  """
  parser = OptionParser()
  parser.add_option(
      '-l', '--log_level', metavar='LOG_LEVEL', default='error',
      help='LOG_LEVEL: debug, info, warning or error [default: %default]')
  parser.add_option(
      '-j', '--js', dest='js', action='store_true', default=False,
      help='Removes all javascript elements [default: %default]')

  (options, _) = parser.parse_args()
  options.log_level = options.log_level.upper()
  if options.log_level not in ['DEBUG', 'INFO', 'WARNING', 'ERROR']:
    print('Wrong log_level argument.')
    parser.print_help()
    return 1

  # Translate the level name into the numeric constant of the logging module.
  options.log_level = getattr(logging, options.log_level)
  extractor = FormsExtractor(logging_level=options.log_level)
  extractor.Extract(options.js)
  return 0


if __name__ == '__main__':
  sys.exit(main())