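"""Tests for urllib.robotparser (RobotFileParser).

The parser is exercised both on in-memory robots.txt text via parse() and
over HTTP via read(), against a local test server and www.pythontest.net.
"""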
import io
import os
import threading
import unittest
import urllib.robotparser
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer


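# Shared mix-in for the robots.txt fixtures below.  Subclasses provide
# robots_txt, agent, good and bad; setUp() feeds the text to
# RobotFileParser.parse(), and the two tests assert can_fetch() for every
# entry.  An entry is either a URL string or an (agent, url) tuple.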
class BaseRobotTest:
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


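# Adds a test that crawl_delay() and request_rate() report the values given
# by the crawl_delay/request_rate class attributes; either check is skipped
# when the corresponding attribute is set to None.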
class BaseRequestRateTest(BaseRobotTest):

    def test_request_rate(self):
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                if self.crawl_delay:
                    self.assertEqual(
                        self.parser.crawl_delay(agent), self.crawl_delay
                    )
                if self.request_rate:
                    self.assertIsInstance(
                        self.parser.request_rate(agent),
                        urllib.robotparser.RequestRate
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).seconds,
                        self.request_rate.seconds
                    )


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = urllib.robotparser.RequestRate(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'
    # crawl_delay and request_rate are not asserted for this agent, but the
    # attributes must still be defined (as None) so the inherited
    # test_request_rate can run
    request_rate = None
    crawl_delay = None


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # The parser should respect the order of User-agent entries.  Note that
    # this robots.txt file is incorrect because "Googlebot" is a substring
    # of "Googlebot-Mobile"
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the ordering wrong: URLs need to be specified
    # from more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = urllib.robotparser.RequestRate(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


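# str(RobotFileParser) should reproduce the parsed rules; as expected_output
# below shows, the named entry comes first, the default (*) entry last, and
# the inline comments are stripped.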
class StringFormattingTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
    """

    expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/

"""

    def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)


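# Minimal handler that answers every request, including /robots.txt, with a
# 403 error, and silences the default request logging.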
class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval':0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


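# Exercises the parser against the live robots.txt hosted at
# www.pythontest.net; support.requires('network') in setUpClass() skips
# these tests when the 'network' test resource is not enabled.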
class NetworkTestCase(unittest.TestCase):

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__ == '__main__':
    unittest.main()
    359