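"""Tests for urllib.robotparser (parser for robots.txt)."""
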
import io
import os
import unittest
import urllib.robotparser
from collections import namedtuple
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer
try:
    import threading
except ImportError:
    threading = None


class BaseRobotTest:
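    # Mixin for the tests below: subclasses provide ``robots_txt`` plus the
    # URLs that ``agent`` should be allowed to fetch (``good``) or not
    # (``bad``); entries may also be (agent, url) tuples overriding the agent.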
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class BaseRequestRateTest(BaseRobotTest):
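    # Expected values; subclasses override them.  None means the
    # corresponding assertions in test_request_rate() are skipped.
    crawl_delay = None
    request_rate = None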

    def test_request_rate(self):
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                if self.crawl_delay:
                    self.assertEqual(
                        self.parser.crawl_delay(agent), self.crawl_delay
                    )
                if self.request_rate:
                    self.assertEqual(
                        self.parser.request_rate(agent).requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).seconds,
                        self.request_rate.seconds
                    )


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'
    # request_rate and crawl_delay are not re-checked here; the same
    # robots.txt is parsed again only to run the good/bad URL checks
    # against this longer agent string.
    request_rate = None
    crawl_delay = None


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # the order of User-agent entries should be respected.  Note that this
    # file is incorrect because "Googlebot" is a substring of
    # "Googlebot-Mobile"
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong. You need
    # to specify the URLs from more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
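    # the test agent has no entry of its own, so the crawl delay and
    # request rate must come from the default (*) entry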
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class RobotHandler(BaseHTTPRequestHandler):
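    # Answer every request with 403 so that even robots.txt cannot be
    # fetched, simulating a password protected site.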

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


@unittest.skipUnless(threading, 'threading required for this test')
class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
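        # a 401/403 response while fetching robots.txt makes the parser
        # treat every URL as disallowed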
        self.assertFalse(parser.can_fetch("*", robots_url))


class NetworkTestCase(unittest.TestCase):

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
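        # a 404 for robots.txt means there are no rules, so everything is
        # allowed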
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__ == '__main__':
    unittest.main()