      1 """ robotparser.py
      2 
      3     Copyright (C) 2000  Bastian Kleineidam
      4 
      5     You can choose between two licenses when using this package:
      6     1) GNU GPLv2
      7     2) PSF license for Python 2.2
      8 
      9     The robots.txt Exclusion Protocol is implemented as specified in
     10     http://www.robotstxt.org/norobots-rfc.txt
     11 
     12 """
import urlparse
import urllib

__all__ = ["RobotFileParser"]
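
# A typical usage sketch (the URL and the agent name "ExampleBot" are
# illustrative placeholders, not part of this module):
#
#     rp = RobotFileParser("http://www.example.com/robots.txt")
#     rp.read()
#     rp.can_fetch("ExampleBot", "http://www.example.com/some/page.html")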


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]
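        # The split above keeps the network location and path components,
        # e.g. the illustrative URL "http://www.example.com/robots.txt"
        # yields host "www.example.com" and path "/robots.txt".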

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            # robots.txt itself is access-restricted: assume all URLs
            # are disallowed
            self.disallow_all = True
        elif self.errcode >= 400 and self.errcode < 500:
            # any other client error means there is no robots.txt:
            # assume all URLs are allowed
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.
           We allow a user-agent: line that is not preceded by
           one or more blank lines."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0
        entry = Entry()

        self.modified()
        for line in lines:
            linenumber += 1
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)
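
    # Illustration (assumed input, not taken from the module): feeding
    # parse() the lines
    #
    #     User-agent: ExampleBot
    #     Allow: /private/public.html
    #     Disallow: /private/
    #
    # produces one Entry with useragents == ['ExampleBot'] and two rule
    # lines; allowance() returns the verdict of the first rule whose path
    # prefixes the URL, so /private/public.html stays fetchable while the
    # rest of /private/ is blocked.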


    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True

        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False

        # search for given user agent matches
        # the first match counts
        # normalize the URL: drop the scheme and netloc, keep only the
        # re-quoted path (plus params, query and fragment)
        parsed_url = urlparse.urlparse(urllib.unquote(url))
        url = urlparse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True
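
    # Example of the normalization above (illustrative URL): can_fetch()
    # reduces "http://www.example.com/a%20b" to the quoted path "/a%20b"
    # before matching it against the rule paths; an empty path becomes "/".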


    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urlparse.urlunparse(urlparse.urlparse(path))
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
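
# Illustration (assumed values, not from the module): RuleLine("", False)
# becomes an allow-everything rule, since an empty Disallow value means
# unrestricted access; its normalized path "" is a prefix of every filename,
# so applies_to() is always true and allowance is True.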


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """Check if this entry applies to the specified agent.

        The match is a case-insensitive substring test on the agent's
        name token; e.g. an entry for "example" applies to the agent
        string "ExampleBot/1.0"."""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True

class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If the robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
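
if __name__ == '__main__':
    # A minimal demonstration sketch (an illustrative addition: the agent
    # names and URLs below are made-up examples).  It feeds parse() a
    # hard-coded robots.txt instead of fetching one over the network.
    _rp = RobotFileParser()
    _rp.parse([
        "User-agent: ExampleBot",
        "Disallow: /private/",
        "",
        "User-agent: *",
        "Disallow: /tmp/",
    ])
    print _rp.can_fetch("ExampleBot", "http://www.example.com/private/x")  # False
    print _rp.can_fetch("OtherBot", "http://www.example.com/tmp/cache")    # False
    print _rp.can_fetch("OtherBot", "http://www.example.com/index.html")   # True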