#!/usr/bin/env python
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Unit tests for webforms_aggregator, run against two local mock web sites."""

import logging
import os
import shutil
import subprocess
import sys
import tempfile
import unittest

# Same name as the aggregator module name.
import webforms_aggregator

# Attach a console handler to the aggregator's own logger so that its output
# is visible while the tests run.
logger = logging.getLogger(webforms_aggregator.__name__)
console = logging.StreamHandler()
logger.addHandler(console)

# Commenting out the following line will set logger level to default: WARNING
logger.setLevel(logging.INFO)


class WebformsAggregatorTest(unittest.TestCase):
  """Unit tests for the webforms_aggregator module.

  Every test gets a fresh temporary directory holding the HTML files of two
  mock sites, each served by its own SimpleHTTPServer child process:
    * site one (PORT1) links to a registration page from its sign-in page;
    * site two (PORT2) has a sign-in page but no registration page.
  """
  PORT1 = 8002
  PORT2 = 8003

  # Home page template. Arguments: title, heading, page1 href, page2 href,
  # page3 href, sign-in href.
  HOME_CONTENT = """
    <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" \
      "http://www.w3.org/TR/html4/loose.dtd">
    <html>
    <head>
      <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
      <title>%s</title>
    </head>
    <body>
      <h1>%s</h1>
      <p>This is a mock site. Its mere purpose is to contribute towards testing \
        the aggregator crawler.</p>
      <ul>
        <li><a href="%s">page1</a></li>
        <li><a href="%s">page2</a></li>
        <li><a href="%s">page3</a></li>
      </ul>
      <hr>
      <p>
        <a href="%s">sign in</a>
      </p>
    </body>
    </html>
  """

  # Plain content page template. Arguments: title, heading, paragraph text,
  # first link href, first link text, second link href, second link text,
  # home page href.
  SIMPLE_PAGE_CONTENT = """
    <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" \
      "http://www.w3.org/TR/html4/loose.dtd">
    <html>
    <head>
      <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
      <title>%s</title>
    </head>
    <body>
      <h1>%s</h1>
      <p>%s</p>
      <ul>
        <li><a href="%s">%s</a></li>
        <li><a href="%s">%s</a></li>
      </ul>
      <hr>
      <p>
        <a href="%s">return to home page</a>
      </p>
    </body>
    </html>
  """

  # Sign-in page template. Arguments: title, sub-heading (may contain HTML),
  # home page href.
  SIGNIN_CONTENT = """
    <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" \
      "http://www.w3.org/TR/html4/loose.dtd">
    <html>
    <head>
      <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
      <title>%s</title>
    </head>
    <body>
      <h1>Sign in!</h1>
      <h3>%s</h3>
      <form>
        <label>User name: </label><input type="text"><br><br>
        <label>password: </label><input type="password"><br><br>
        <input type="submit" value="Sign in">
      </form>
      <hr>
      <p><a href="%s">return to home page</a></p>
    </body>
    </html>
  """

  # Registration page template. Arguments: title, home page href.
  REG_CONTENT = """
    <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" \
      "http://www.w3.org/TR/html4/loose.dtd">
    <html>
    <head>
      <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
      <title>%s</title>
    </head>
    <body>
      <h1>Create a user account!</h1>

      <h3>Enter your data below:</h3>
      <form method="get">
        <label>First name: </label><input type="text"><br><br>
        <label>Surname: </label><input type="text"><br><br>
        <label>User name: </label><input type="text"><br><br>
        <label>password: </label><input type="password"><br><br>
        <label>retype password: </label><input type="password"><br><br>
        <input type="submit" value="Register">
      </form>
      <hr>
      <p><a href="%s">return to home page</a></p>
    </body>
    </html>
  """

  def _WriteFiles(self, file_content):
    """Writes every entry of |file_content| into the current directory.

    Args:
      file_content: dict mapping a file name to the text to store in it.
    """
    for filename, content in file_content.items():
      with open(filename, 'w') as f:
        f.write(content)

  def _StartServer(self, port):
    """Starts a SimpleHTTPServer serving the current directory on |port|.

    The server is run with the interpreter that is running this test rather
    than whichever "python" happens to be first on PATH.

    Args:
      port: TCP port number the server should listen on.

    Returns:
      The subprocess.Popen object of the server process.
    """
    server = subprocess.Popen(
        [sys.executable, '-u', '-m', 'SimpleHTTPServer', str(port)],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    # Block until the child prints its first output line; needed in order for
    # the server to start up before the tests issue requests.
    server.stdout.readline()
    return server

  def CreateMockSiteOne(self):
    """Creates the files of site one, which has a registration form.

    The generated file names are recorded in self.files under 'site1_*' keys
    and the pages are written into the current working directory.
    """
    self.files['site1_home'] = 'site1_index.html'
    self.files['site1_page1'] = 'site1_page1.html'
    self.files['site1_page2'] = 'site1_page2.html'
    self.files['site1_page3'] = 'site1_page3.html'
    self.files['site1_signin'] = 'site1_signin.html'
    self.files['site1_reg'] = 'site1_register.html'

    file_content = {}
    file_content[self.files['site1_home']] = self.HOME_CONTENT % (
        'Site One home page', 'Welcome to site one. It has a reg page!',
        self.files['site1_page1'], self.files['site1_page2'],
        self.files['site1_page3'], self.files['site1_signin'])

    file_content[self.files['site1_page1']] = self.SIMPLE_PAGE_CONTENT % (
        'Site One page 1',
        'Page 1!', 'This is a useless page. It does almost nothing.',
        self.files['site1_page2'], 'page 2', self.files['site1_page3'],
        'page 3', self.files['site1_home'])

    file_content[self.files['site1_page2']] = self.SIMPLE_PAGE_CONTENT % (
        'Site One page 2', 'Page 2!',
        'This is another useless page. It does almost what the page 1 does.',
        self.files['site1_page1'], 'page 1', self.files['site1_page3'],
        'page 3', self.files['site1_home'])

    file_content[self.files['site1_page3']] = self.SIMPLE_PAGE_CONTENT % (
        'Site One page 3', 'Page 3!',
        "This is the last useless page. It doesn't do anything useful at all.",
        self.files['site1_page1'], 'page 1', self.files['site1_page2'],
        'page 2', self.files['site1_home'])

    # The sign-in page is the only page that links to the registration page.
    file_content[self.files['site1_signin']] = self.SIGNIN_CONTENT % (
        'Site One signin',
        'If you don\'t have a user account click <a href="%s">here</a>.' \
          % self.files['site1_reg'],
        self.files['site1_home'])

    # NOTE(review): the registration page reuses the 'Site One signin' title;
    # possibly a copy-paste, left as is because the crawler may read it.
    file_content[self.files['site1_reg']] = self.REG_CONTENT % (
        'Site One signin', self.files['site1_home'])

    self._WriteFiles(file_content)

  def CreateMockSiteTwo(self):
    """Creates the files of site two, which has no registration page.

    The generated file names are recorded in self.files under 'site2_*' keys
    and the pages are written into the current working directory.
    """
    self.files['site2_home'] = 'site2_index.html'
    self.files['site2_page1'] = 'site2_page1.html'
    self.files['site2_page2'] = 'site2_page2.html'
    self.files['site2_page3'] = 'site2_page3.html'
    self.files['site2_signin'] = 'site2_signin.html'

    file_content = {}
    file_content[self.files['site2_home']] = self.HOME_CONTENT % (
        'Site Two home page', 'Welcome to site two. It has no reg page!',
        self.files['site2_page1'], self.files['site2_page2'],
        self.files['site2_page3'], self.files['site2_signin'])

    file_content[self.files['site2_page1']] = self.SIMPLE_PAGE_CONTENT % (
        'Site Two page 1',
        'Page 1!', 'This is a useless page. It does almost nothing.',
        self.files['site2_page2'], 'page 2', self.files['site2_page3'],
        'page 3', self.files['site2_home'])

    file_content[self.files['site2_page2']] = self.SIMPLE_PAGE_CONTENT % (
        'Site Two page 2', 'Page 2!',
        'This is another useless page. It does almost what the page 1 does.',
        self.files['site2_page1'], 'page 1', self.files['site2_page3'],
        'page 3', self.files['site2_home'])

    file_content[self.files['site2_page3']] = self.SIMPLE_PAGE_CONTENT % (
        'Site Two page 3', 'Page 3!',
        "This is the last useless page. It doesn't do anything useful at all.",
        self.files['site2_page1'], 'page 1', self.files['site2_page2'],
        'page 2', self.files['site2_home'])

    file_content[self.files['site2_signin']] = self.SIGNIN_CONTENT % (
        'Site Two signin', 'You cannot register online with this site.',
        self.files['site2_home'])

    self._WriteFiles(file_content)

  def setUp(self):
    """Builds both mock sites in a temp directory and starts their servers."""
    self.cwd = os.getcwdu()
    self.temp_dir = tempfile.mkdtemp()
    # The HTTP servers serve their working directory, and the mock files are
    # written with relative names, so work from inside the temp directory.
    os.chdir(self.temp_dir)

    self.files = {}

    self.CreateMockSiteOne()
    self.CreateMockSiteTwo()
    self.files['cookie'] = 'test.cookie'
    self.url1 = 'http://localhost:%s/%s' % (self.PORT1,
                                            self.files['site1_home'])
    self.url2 = 'http://localhost:%s/%s' % (self.PORT2,
                                            self.files['site2_home'])
    self.domain1 = 'localhost:%s' % self.PORT1

    # File of URLs for the threaded crawler: a header line followed by one
    # URL per line.
    self.files['url'] = 'urls.txt'
    with open(self.files['url'], 'w') as url_file_handler:
      url_file_handler.write('URLs to crawl:')
      url_file_handler.write(os.linesep)
      for url in (self.url1, self.url2):
        url_file_handler.write(url)
        url_file_handler.write(os.linesep)

    self.server1 = self._StartServer(self.PORT1)
    self.server2 = self._StartServer(self.PORT2)

  def tearDown(self):
    """Stops both servers and removes the temp directory."""
    servers = (self.server1, self.server2)
    for server in servers:
      server.terminate()
    for server in servers:
      # Reap the child so that it does not linger as a zombie and its port is
      # released before the next test starts, then close our end of its pipe.
      server.wait()
      server.stdout.close()

    # Leave the temp directory before deleting it. rmtree also removes any
    # file the code under test created beyond those listed in self.files.
    os.chdir(self.cwd)
    shutil.rmtree(self.temp_dir)

  def testRetrieverDownloadsPage(self):
    """Verify the retriever can download a page."""
    r = webforms_aggregator.Retriever(self.url1, self.domain1,
                                      self.files['cookie'])
    self.assertTrue(r.Download(),
                    msg='Retriever could not download "%s"' % self.url1)

  def testCrawlerFindsRegPageFromUrl(self):
    """Verify that the crawler is able to find a reg page from the given URL."""
    c = webforms_aggregator.Crawler(self.url1)
    self.assertTrue(
        c.Run(), msg='Crawler could not find the reg page of "%s"' % self.url1)

  def testCrawlerCannotFindNonExistentRegPageFromUrl(self):
    """Verify that the crawler won't find a non existent reg page.

    Site two has no registration page, so crawling its home page must fail.
    """
    c = webforms_aggregator.Crawler(self.url2)
    self.assertFalse(
        c.Run(),
        msg='Crawler found a non existent reg page of "%s"' % self.url2)

  def testThreadedCrawlerFindsRegPageFromUrlsFile(self):
    """Verify the threaded crawler finds reg page from a file of URLs."""
    c = webforms_aggregator.ThreadedCrawler(self.files['url'])
    self.assertNotEqual(
        c.Run(), -1,
        msg='Threaded crawler could not find the reg page from the URLs file')


if __name__ == '__main__':
  suite = unittest.TestLoader().loadTestsFromTestCase(
      WebformsAggregatorTest)
  unittest.TextTestRunner(verbosity=2).run(suite)