import unittest, StringIO, robotparser
from test import test_support

class RobotTestCase(unittest.TestCase):
    def __init__(self, index, parser, url, good, agent):
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

tests = unittest.TestSuite()

def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

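# A minimal stand-alone sketch of what RobotTest() does under the hood: feed
# robots.txt text to RobotFileParser.parse() and query can_fetch().  The
# rules, paths and agent name below are illustrative only and are not part of
# the suite built above.
_sketch = robotparser.RobotFileParser()
_sketch.parse(StringIO.StringIO(
    "User-agent: *\n"
    "Disallow: /private/\n").readlines())
assert _sketch.can_fetch("example-agent", "/public.html")
assert not _sketch.can_fetch("example-agent", "/private/data.html")
del _sketch
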
# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(8, doc, good, bad, agent="Googlebot")

# 9.  This file is incorrect because "Googlebot" is a substring of
#     "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, agent="Googlebot")

good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")

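# Illustrative follow-up to the note above tests 9/10 (a sketch, not another
# test case): because robotparser matches user agents by substring, the
# "Googlebot" entry in `doc` also captures "Googlebot-Mobile", so the later
# Allow entry is never consulted.
_substr_demo = robotparser.RobotFileParser()
_substr_demo.parse(StringIO.StringIO(doc).readlines())
assert not _substr_demo.can_fetch("Googlebot-Mobile", "/something.jpg")
del _substr_demo
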
# 11.  Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, agent="Googlebot")

good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")


# 13.  Google also got the order wrong in #8.  You need to specify the
#      URLs from more specific to more general.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, agent="googlebot")


# 14. For issue #6325 (query string support)
doc = """
User-agent: *
Disallow: /some/path?name=value
"""

good = ['/some/path']
bad = ['/some/path?name=value']

RobotTest(14, doc, good, bad)

# 15. For issue #4108 (obey first * entry)
doc = """
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
"""

good = ['/another/path']
bad = ['/some/path']

RobotTest(15, doc, good, bad)


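# Note: the RobotTest() calls above only build up the module-level `tests`
# suite; test_main() at the bottom of the file is what actually runs it.  A
# rough, purely illustrative way to run the same suite by hand would be:
#
#     unittest.TextTestRunner(verbosity=2).run(tests)
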
class NetworkTestCase(unittest.TestCase):

    def testPasswordProtectedSite(self):
        test_support.requires('network')
        with test_support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            parser = robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except IOError:
                self.skipTest('%s is unavailable' % url)
            self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False)

    def testPythonOrg(self):
        test_support.requires('network')
        with test_support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt"))


def test_main():
    test_support.run_unittest(tests)
    test_support.run_unittest(NetworkTestCase)

if __name__=='__main__':
    test_support.verbose = 1
    test_main()