"""Tests for urllib.robotparser."""

import io
import os
import threading
import unittest
import urllib.robotparser
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer


class BaseRobotTest:
    """Parse robots_txt and check can_fetch() for the good and bad URLs."""
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class BaseRequestRateTest(BaseRobotTest):

    def test_request_rate(self):
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                if self.crawl_delay:
                    self.assertEqual(
                        self.parser.crawl_delay(agent), self.crawl_delay
                    )
                if self.request_rate:
                    self.assertIsInstance(
                        self.parser.request_rate(agent),
                        urllib.robotparser.RequestRate
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).seconds,
                        self.request_rate.seconds
                    )


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = urllib.robotparser.RequestRate(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'
    # crawl_delay and request_rate are not re-checked for this agent;
    # setting them to None makes the inherited request-rate test skip
    # those assertions while the good/bad URL checks still run.
    request_rate = None
    crawl_delay = None


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # the User-agent entries should be matched in the order they appear.
    # Note that this file is incorrect because "Googlebot" is a
    # substring of "Googlebot-Mobile"
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong.  You need to specify the URLs
    # from more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = urllib.robotparser.RequestRate(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class StringFormattingTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
    """

    expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/

"""

    def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)


class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


class NetworkTestCase(unittest.TestCase):

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__ == '__main__':
    unittest.main()