import unittest, StringIO, robotparser
from test import test_support
from urllib2 import urlopen, HTTPError

class RobotTestCase(unittest.TestCase):
    def __init__(self, index, parser, url, good, agent):
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

tests = unittest.TestSuite()

def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
141 doc = """ 142 User-agent: Googlebot 143 Allow: /folder1/myfile.html 144 Disallow: /folder1/ 145 """ 146 147 good = ['/folder1/myfile.html'] 148 bad = ['/folder1/anotherfile.html'] 149 150 RobotTest(8, doc, good, bad, agent="Googlebot") 151 152 # 9. This file is incorrect because "Googlebot" is a substring of 153 # "Googlebot-Mobile", so test 10 works just like test 9. 154 doc = """ 155 User-agent: Googlebot 156 Disallow: / 157 158 User-agent: Googlebot-Mobile 159 Allow: / 160 """ 161 162 good = [] 163 bad = ['/something.jpg'] 164 165 RobotTest(9, doc, good, bad, agent="Googlebot") 166 167 good = [] 168 bad = ['/something.jpg'] 169 170 RobotTest(10, doc, good, bad, agent="Googlebot-Mobile") 171 172 # 11. Get the order correct. 173 doc = """ 174 User-agent: Googlebot-Mobile 175 Allow: / 176 177 User-agent: Googlebot 178 Disallow: / 179 """ 180 181 good = [] 182 bad = ['/something.jpg'] 183 184 RobotTest(11, doc, good, bad, agent="Googlebot") 185 186 good = ['/something.jpg'] 187 bad = [] 188 189 RobotTest(12, doc, good, bad, agent="Googlebot-Mobile") 190 191 192 # 13. Google also got the order wrong in #8. You need to specify the 193 # URLs from more specific to more general. 194 doc = """ 195 User-agent: Googlebot 196 Allow: /folder1/myfile.html 197 Disallow: /folder1/ 198 """ 199 200 good = ['/folder1/myfile.html'] 201 bad = ['/folder1/anotherfile.html'] 202 203 RobotTest(13, doc, good, bad, agent="googlebot") 204 205 206 # 14. For issue #6325 (query string support) 207 doc = """ 208 User-agent: * 209 Disallow: /some/path?name=value 210 """ 211 212 good = ['/some/path'] 213 bad = ['/some/path?name=value'] 214 215 RobotTest(14, doc, good, bad) 216 217 # 15. For issue #4108 (obey first * entry) 218 doc = """ 219 User-agent: * 220 Disallow: /some/path 221 222 User-agent: * 223 Disallow: /another/path 224 """ 225 226 good = ['/another/path'] 227 bad = ['/some/path'] 228 229 RobotTest(15, doc, good, bad) 230 231 232 class NetworkTestCase(unittest.TestCase): 233 234 def testPasswordProtectedSite(self): 235 test_support.requires('network') 236 with test_support.transient_internet('mueblesmoraleda.com'): 237 url = 'http://mueblesmoraleda.com' 238 robots_url = url + "/robots.txt" 239 # First check the URL is usable for our purposes, since the 240 # test site is a bit flaky. 241 try: 242 urlopen(robots_url) 243 except HTTPError as e: 244 if e.code not in {401, 403}: 245 self.skipTest( 246 "%r should return a 401 or 403 HTTP error, not %r" 247 % (robots_url, e.code)) 248 else: 249 self.skipTest( 250 "%r should return a 401 or 403 HTTP error, not succeed" 251 % (robots_url)) 252 parser = robotparser.RobotFileParser() 253 parser.set_url(url) 254 try: 255 parser.read() 256 except IOError: 257 self.skipTest('%s is unavailable' % url) 258 self.assertEqual(parser.can_fetch("*", robots_url), False) 259 260 def testPythonOrg(self): 261 test_support.requires('network') 262 with test_support.transient_internet('www.python.org'): 263 parser = robotparser.RobotFileParser( 264 "http://www.python.org/robots.txt") 265 parser.read() 266 self.assertTrue( 267 parser.can_fetch("*", "http://www.python.org/robots.txt")) 268 269 270 def test_main(): 271 test_support.run_unittest(tests) 272 test_support.run_unittest(NetworkTestCase) 273 274 if __name__=='__main__': 275 test_support.verbose = 1 276 test_main() 277