import unittest, StringIO, robotparser
from test import test_support
from urllib2 import urlopen, HTTPError

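# Unit tests for the robotparser module: a table of local robots.txt cases
# built with RobotTest(), plus optional network tests against live sites.
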
class RobotTestCase(unittest.TestCase):
    def __init__(self, index, parser, url, good, agent):
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

tests = unittest.TestSuite()

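# Parse robots_txt and add one RobotTestCase per URL to the suite.  URLs in
# good_urls must be fetchable by `agent`, URLs in bad_urls must be blocked;
# an (agent, url) tuple in either list overrides the default agent.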
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(8, doc, good, bad, agent="Googlebot")

# 9.  This file is incorrect because "Googlebot" is a substring of
#     "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, agent="Googlebot")

good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")

# 11.  Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, agent="Googlebot")

good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")


# 13.  Google also got the order wrong in #8.  You need to specify the
#      URLs from more specific to more general.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, agent="googlebot")


# 14. For issue #6325 (query string support)
doc = """
User-agent: *
Disallow: /some/path?name=value
"""

good = ['/some/path']
bad = ['/some/path?name=value']

RobotTest(14, doc, good, bad)

# 15. For issue #4108 (obey first * entry)
doc = """
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
"""

good = ['/another/path']
bad = ['/some/path']

RobotTest(15, doc, good, bad)


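# These tests talk to live sites and run only when the 'network' resource is
# enabled (for example via regrtest's -u network option).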
class NetworkTestCase(unittest.TestCase):

    def testPasswordProtectedSite(self):
        test_support.requires('network')
        with test_support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            robots_url = url + "/robots.txt"
            # First check the URL is usable for our purposes, since the
            # test site is a bit flaky.
            try:
                urlopen(robots_url)
            except HTTPError as e:
                if e.code not in {401, 403}:
                    self.skipTest(
                        "%r should return a 401 or 403 HTTP error, not %r"
                        % (robots_url, e.code))
            else:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not succeed"
                    % (robots_url))
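            # robotparser treats a 401/403 response for robots.txt as
            # "disallow all", so can_fetch() must return False here.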
            parser = robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except IOError:
                self.skipTest('%s is unavailable' % url)
            self.assertEqual(parser.can_fetch("*", robots_url), False)

    def testPythonOrg(self):
        test_support.requires('network')
        with test_support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt"))


def test_main():
    test_support.run_unittest(tests)
    test_support.run_unittest(NetworkTestCase)

if __name__ == '__main__':
    test_support.verbose = 1
    test_main()