Home | History | Annotate | Download | only in tools
      1 #!/usr/bin/env python
      2 # Copyright (c) 2011 The Chromium Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 import logging
      7 import os
      8 import subprocess
      9 import tempfile
     10 import unittest
     11 
     12 # Same name as the aggregator module name.
     13 import webforms_aggregator
     14 
     15 logger = logging.getLogger(webforms_aggregator.__name__)
     16 console = logging.StreamHandler()
     17 logger.addHandler(console)
     18 
     19 # Commenting out the following line will set logger level to default: WARNING
     20 logger.setLevel(logging.INFO)
     21 
     22 
     23 class WebformsAggregatorTest(unittest.TestCase):
     24   """Unit tests for the webforms_aggregator module."""
     25   PORT1 = 8002
     26   PORT2 = 8003
     27 
     28   HOME_CONTENT = """
     29     <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" \
     30         "http://www.w3.org/TR/html4/loose.dtd">
     31     <html>
     32     <head>
     33     <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
     34     <title>%s</title>
     35     </head>
     36     <body>
     37     <h1>%s</h1>
     38     <p>This is a mock site. Its mere purpose is to contribute towards testing \
     39         the aggregator crawler.</p>
     40     <ul>
     41      <li><a href="%s">page1</a></li>
     42      <li><a href="%s">page2</a></li>
     43      <li><a href="%s">page3</a></li>
     44     </ul>
     45     <hr>
     46     <p>
     47       <a href="%s">sign in</a>
     48     </p>
     49     </body>
     50     </html>
     51   """
     52 
     53   SIMPLE_PAGE_CONTENT = """
     54     <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" \
     55         "http://www.w3.org/TR/html4/loose.dtd">
     56     <html>
     57     <head>
     58     <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
     59     <title>%s</title>
     60     </head>
     61     <body>
     62     <h1>%s</h1>
     63     <p>%s</p>
     64     <ul>
     65      <li><a href="%s">%s</a></li>
     66      <li><a href="%s">%s</a></li>
     67     </ul>
     68     <hr>
     69     <p>
     70       <a href="%s">return to home page</a>
     71     </p>
     72     </body>
     73     </html>
     74   """
     75 
     76   SIGNIN_CONTENT = """
     77     <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" \
     78         "http://www.w3.org/TR/html4/loose.dtd">
     79     <html>
     80     <head>
     81     <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
     82     <title>%s</title>
     83     </head>
     84     <body>
     85     <h1>Sign in!</h1>
     86     <h3>%s</h3>
     87     <form>
     88       <label>User name: </label><input type="text"><br><br>
     89       <label>password: </label><input type="password"><br><br>
     90       <input type="submit" value="Sign in">
     91     </form>
     92     <hr>
     93     <p><a href="%s">return to home page</a></p>
     94     </body>
     95     </html>
     96   """
     97 
     98   REG_CONTENT = """
     99     <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" \
    100         "http://www.w3.org/TR/html4/loose.dtd">
    101     <html>
    102     <head>
    103     <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
    104     <title>%s</title>
    105     </head>
    106     <body>
    107     <h1>Create a user account!</h1>
    108 
    109     <h3>Enter your data below:</h3>
    110     <form method="get">
    111       <label>First name: </label><input type="text"><br><br>
    112       <label>Surname: </label><input type="text"><br><br>
    113       <label>User name: </label><input type="text"><br><br>
    114       <label>password: </label><input type="password"><br><br>
    115       <label>retype password: </label><input type="password"><br><br>
    116       <input type="submit" value="Register">
    117     </form>
    118     <hr>
    119     <p><a href="%s">return to home page</a></p>
    120     </body>
    121     </html>
    122   """
    123 
    124   def CreateMockSiteOne(self):
    125     """Site One has a registration form.
    126     """
    127     self.files['site1_home'] = 'site1_index.html'
    128     self.files['site1_page1'] = 'site1_page1.html'
    129     self.files['site1_page2'] = 'site1_page2.html'
    130     self.files['site1_page3'] = 'site1_page3.html'
    131     self.files['site1_signin'] = 'site1_signin.html'
    132     self.files['site1_reg'] = 'site1_register.html'
    133 
    134     file_content = {}
    135     file_content[self.files['site1_home']] = self.HOME_CONTENT % (
    136           'Site One home page', 'Welcome to site one. It has a reg page!',
    137           self.files['site1_page1'], self.files['site1_page2'],
    138           self.files['site1_page3'], self.files['site1_signin'])
    139 
    140     file_content[self.files['site1_page1']] = self.SIMPLE_PAGE_CONTENT % (
    141         'Site One page 1',
    142         'Page 1!', 'This is a useless page. It does almost nothing.',
    143         self.files['site1_page2'], 'page 2', self.files['site1_page3'],
    144         'page 3', self.files['site1_home'])
    145 
    146     file_content[self.files['site1_page2']] = self.SIMPLE_PAGE_CONTENT % (
    147         'Site One page 2', 'Page 2!',
    148         'This is another useless page. It does almost what the page 1 does.',
    149         self.files['site1_page1'], 'page 1', self.files['site1_page3'],
    150         'page 3', self.files['site1_home'])
    151 
    152     file_content[self.files['site1_page3']] = self.SIMPLE_PAGE_CONTENT % (
    153         'Site One page 3', 'Page 3!',
    154         "This is the last useless page. It doesn't do anything useful at all.",
    155         self.files['site1_page1'], 'page 1', self.files['site1_page2'],
    156         'page 2', self.files['site1_home'])
    157 
    158     file_content[self.files['site1_signin']] = self.SIGNIN_CONTENT % (
    159         'Site One signin',
    160         'If you don\'t have a user account click <a href="%s">here</a>.' \
    161             % self.files['site1_reg'],
    162         self.files['site1_home'])
    163 
    164     file_content[self.files['site1_reg']] = self.REG_CONTENT % (
    165         'Site One signin', self.files['site1_home'])
    166 
    167     for filename, content in file_content.iteritems():
    168       f = open(filename, 'w')
    169       try:
    170         f.write(content)
    171       finally:
    172         f.close()
    173 
    174   def CreateMockSiteTwo(self):
    175     """ Site Two has no registration page."""
    176 
    177     self.files['site2_home'] = 'site2_index.html'
    178     self.files['site2_page1'] = 'site2_page1.html'
    179     self.files['site2_page2'] = 'site2_page2.html'
    180     self.files['site2_page3'] = 'site2_page3.html'
    181     self.files['site2_signin'] = 'site2_signin.html'
    182 
    183     file_content = {}
    184     file_content[self.files['site2_home']] = self.HOME_CONTENT % (
    185           'Site Two home page', 'Welcome to site two. It has no reg page!',
    186           self.files['site2_page1'], self.files['site2_page2'],
    187           self.files['site2_page3'], self.files['site2_signin'])
    188 
    189     file_content[self.files['site2_page1']] = self.SIMPLE_PAGE_CONTENT % (
    190         'Site Two page 1',
    191         'Page 1!', 'This is a useless page. It does almost nothing.',
    192         self.files['site2_page2'], 'page 2', self.files['site2_page3'],
    193         'page 3', self.files['site2_home'])
    194 
    195     file_content[self.files['site2_page2']] = self.SIMPLE_PAGE_CONTENT % (
    196         'Site Two page 2', 'Page 2!',
    197         'This is another useless page. It does almost what the page 1 does.',
    198         self.files['site2_page1'], 'page 1', self.files['site2_page3'],
    199         'page 3', self.files['site2_home'])
    200 
    201     file_content[self.files['site2_page3']] = self.SIMPLE_PAGE_CONTENT % (
    202         'Site Two page 3', 'Page 3!',
    203         "This is the last useless page. It doesn't do anything useful at all.",
    204         self.files['site2_page1'], 'page 1', self.files['site2_page2'],
    205         'page 2', self.files['site2_home'])
    206 
    207     file_content[self.files['site2_signin']] = self.SIGNIN_CONTENT % (
    208         'Site Two signin', 'You cannot register online with this site.',
    209         self.files['site2_home'])
    210 
    211     for filename, content in file_content.iteritems():
    212       f = open(filename, 'w')
    213       try:
    214         f.write(content)
    215       finally:
    216         f.close()
    217 
    218   def setUp(self):
    219     self.cwd = os.getcwdu()
    220     self.temp_dir = tempfile.mkdtemp()
    221     os.chdir(self.temp_dir)
    222 
    223     self.files = {}
    224 
    225     self.CreateMockSiteOne()
    226     self.CreateMockSiteTwo()
    227     self.files['cookie'] = 'test.cookie'
    228     self.url1 = 'http://localhost:%s/%s' % (self.PORT1,
    229                                             self.files['site1_home'])
    230     self.url2 = 'http://localhost:%s/%s' % (self.PORT2,
    231                                             self.files['site2_home'])
    232     self.domain1 = 'localhost:%s' %self.PORT1
    233     self.files['url'] = 'urls.txt'
    234     url_file_handler = open(self.files['url'], 'w')
    235     try:
    236       url_file_handler.write('URLs to crawl:')
    237       url_file_handler.write(os.linesep)
    238       for url in (self.url1, self.url2):
    239         url_file_handler.write(url)
    240         url_file_handler.write(os.linesep)
    241     finally:
    242       url_file_handler.close()
    243 
    244     command_line = 'python -u -m SimpleHTTPServer %s' % self.PORT1
    245     args = command_line.split()
    246     self.server1 = subprocess.Popen(
    247         args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    248     self.server1.stdout.readline()  # Needed in order for the server to start up
    249 
    250     command_line = 'python -u -m SimpleHTTPServer %s' % self.PORT2
    251     args = command_line.split()
    252     self.server2 = subprocess.Popen(
    253         args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    254     self.server2.stdout.readline()  # Needed in order for the server to start up
    255 
    256   def tearDown(self):
    257     self.server1.terminate()
    258     self.server2.terminate()
    259 
    260     for filename in self.files.values():
    261       if os.path.isfile(filename):
    262         os.unlink(filename)
    263     os.chdir(self.cwd)
    264     os.rmdir(self.temp_dir)
    265 
    266   def testRetrieverDownloadsPage(self):
    267     """Verify the retriever can download a page."""
    268     r = webforms_aggregator.Retriever(self.url1, self.domain1,
    269                                       self.files['cookie'])
    270     self.assertTrue(r.Download(),
    271                 msg='Retriever could not download "%s"' % self.url1)
    272 
    273   def testCrawlerFindsRegPageFromUrl(self):
    274     """Verify that the crawler is able to find a reg page from the given URL."""
    275     c = webforms_aggregator.Crawler(self.url1)
    276     self.assertTrue(
    277         c.Run(), msg='Crawler could not find the reg page of "%s"' % self.url1)
    278 
    279   def testCrawlerCannotFindNonExistentRegPageFromUrl(self):
    280     """Verify that the crawler won't find a non existent reg page
    281     from the given URL."""
    282     c = webforms_aggregator.Crawler(self.url2)
    283     self.assertFalse(
    284         c.Run(),
    285         msg='Crawler found a non existent reg page of "%s"' % self.url1)
    286 
    287   def testThreadedCrawlerFindsRegPageFromUrlsFile(self):
    288     """Verify the threaded crawler finds reg page from a file of URLs."""
    289     c = webforms_aggregator.ThreadedCrawler(self.files['url'])
    290     self.assertNotEqual(
    291         c.Run(), -1,
    292         msg='Threaded crawler could not find the reg page from the URLs file')
    293 
    294 
    295 if __name__ == '__main__':
    296   suite = unittest.TestLoader().loadTestsFromTestCase(
    297       WebformsAggregatorTest)
    298   unittest.TextTestRunner(verbosity=2).run(suite)
    299