""" robotparser.py

Copyright (C) 2000 Bastian Kleineidam

You can choose between two licenses when using this package:
1) GNU GPLv2
2) PSF license for Python 2.2

The robots.txt Exclusion Protocol is implemented as specified in
http://www.robotstxt.org/norobots-rfc.txt
"""

import collections
import time
import urllib.error
import urllib.parse
import urllib.request

__all__ = ["RobotFileParser"]

# A parsed "Request-rate:" value: at most `requests` requests per `seconds`
# seconds.  An *instance* of this is stored on each Entry (the old code
# accidentally stored the namedtuple class itself and mutated class
# attributes, creating a fresh class for every rule).
RequestRate = collections.namedtuple("RequestRate", "requests seconds")


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []          # Entry objects for specific user agents
        self.default_entry = None  # Entry for the catch-all "*" agent
        self.disallow_all = False  # server answered 401/403 for robots.txt
        self.allow_all = False     # robots.txt was absent (other 4xx)
        self.set_url(url)
        self.last_checked = 0      # time.time() of last fetch, 0 = never

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                # Access to robots.txt itself is forbidden: assume
                # everything is disallowed.
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                # robots.txt does not exist: everything is allowed.
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        # Register a completed Entry.  The catch-all "*" entry is kept
        # separately so lookups consult it last.
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                # A blank line terminates the current record.
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        # New record started without a separating blank line.
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # before trying to convert to int we need to make
                        # sure that robots.txt has valid syntax otherwise
                        # it will crash
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # check if all values are sane
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                                and numbers[1].strip().isdigit()):
                            # Store a proper RequestRate instance (the old
                            # code assigned attributes on a freshly created
                            # namedtuple class instead of instantiating it).
                            entry.req_rate = RequestRate(int(numbers[0]),
                                                         int(numbers[1]))
                        state = 2
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
                                       parsed_url.params, parsed_url.query,
                                       parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def crawl_delay(self, useragent):
        """Return the Crawl-delay for *useragent*, or None if unknown."""
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        # Guard against a missing "*" entry; the old code raised
        # AttributeError here when default_entry was None (bpo-35922).
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        """Return the Request-rate for *useragent* as a RequestRate
        namedtuple, or None if unknown."""
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        # Same None-guard as crawl_delay() (bpo-35922).
        if self.default_entry:
            return self.default_entry.req_rate
        return None

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        # "*" matches anything; otherwise plain prefix match.
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []   # agent name tokens this entry applies to
        self.rulelines = []    # RuleLine objects, in file order
        self.delay = None      # parsed Crawl-delay value, if any
        self.req_rate = None   # parsed RequestRate value, if any

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True