1 """ robotparser.py 2 3 Copyright (C) 2000 Bastian Kleineidam 4 5 You can choose between two licenses when using this package: 6 1) GNU GPLv2 7 2) PSF license for Python 2.2 8 9 The robots.txt Exclusion Protocol is implemented as specified in 10 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html 11 """ 12 import urlparse 13 import urllib 14 15 __all__ = ["RobotFileParser"] 16 17 18 class RobotFileParser: 19 """ This class provides a set of methods to read, parse and answer 20 questions about a single robots.txt file. 21 22 """ 23 24 def __init__(self, url=''): 25 self.entries = [] 26 self.default_entry = None 27 self.disallow_all = False 28 self.allow_all = False 29 self.set_url(url) 30 self.last_checked = 0 31 32 def mtime(self): 33 """Returns the time the robots.txt file was last fetched. 34 35 This is useful for long-running web spiders that need to 36 check for new robots.txt files periodically. 37 38 """ 39 return self.last_checked 40 41 def modified(self): 42 """Sets the time the robots.txt file was last fetched to the 43 current time. 44 45 """ 46 import time 47 self.last_checked = time.time() 48 49 def set_url(self, url): 50 """Sets the URL referring to a robots.txt file.""" 51 self.url = url 52 self.host, self.path = urlparse.urlparse(url)[1:3] 53 54 def read(self): 55 """Reads the robots.txt URL and feeds it to the parser.""" 56 opener = URLopener() 57 f = opener.open(self.url) 58 lines = [line.strip() for line in f] 59 f.close() 60 self.errcode = opener.errcode 61 if self.errcode in (401, 403): 62 self.disallow_all = True 63 elif self.errcode >= 400: 64 self.allow_all = True 65 elif self.errcode == 200 and lines: 66 self.parse(lines) 67 68 def _add_entry(self, entry): 69 if "*" in entry.useragents: 70 # the default entry is considered last 71 if self.default_entry is None: 72 # the first default entry wins 73 self.default_entry = entry 74 else: 75 self.entries.append(entry) 76 77 def parse(self, lines): 78 """parse the input lines from a robots.txt file. 

        A user-agent: line need not be preceded by one or more
        blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber += 1
            if not line:
                # a blank line ends the current record
                if state == 1:
                    # user-agent lines without any rules: drop the entry
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        parsed_url = urlparse.urlparse(urllib.unquote(url))
        url = urlparse.urlunparse(('', '', parsed_url.path,
                                   parsed_url.params, parsed_url.query,
                                   parsed_url.fragment))
        url = urllib.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines."""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """Check if this entry applies to the specified agent."""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True


class URLopener(urllib.FancyURLopener):
    """URL opener that records the HTTP error code of a failed fetch in
    self.errcode (defaulting to 200) and never prompts for credentials."""

    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
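
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): it feeds a
# hand-written robots.txt to parse() instead of fetching one with read(),
# then asks can_fetch() a few questions.  The rules, the "ExampleBot" agent
# name and the example.com URLs below are illustrative placeholders only.
if __name__ == '__main__':
    rp = RobotFileParser('http://example.com/robots.txt')
    rp.parse([
        'User-agent: *',
        'Allow: /private/public-page.html',
        'Disallow: /private/',
        '',
        'User-agent: ExampleBot',
        'Disallow: /',
    ])
    # Matching is first-match-wins, so the more specific Allow: line above
    # must precede the broader Disallow: line to take effect.
    print rp.can_fetch('ExampleBot/1.0', 'http://example.com/index.html')          # False
    print rp.can_fetch('OtherBot', 'http://example.com/private/data.html')         # False
    print rp.can_fetch('OtherBot', 'http://example.com/private/public-page.html')  # True
    print rp.can_fetch('OtherBot', 'http://example.com/index.html')                # True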