      1 """ robotparser.py
      2 
      3     Copyright (C) 2000  Bastian Kleineidam
      4 
      5     You can choose between two licenses when using this package:
      6     1) GNU GPLv2
      7     2) PSF license for Python 2.2
      8 
      9     The robots.txt Exclusion Protocol is implemented as specified in
     10     http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
     11 """
import urlparse
import urllib

__all__ = ["RobotFileParser"]


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

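    A typical session (the URL and agent name here are hypothetical):

        rp = RobotFileParser("http://www.example.com/robots.txt")
        rp.read()
        if rp.can_fetch("MyCrawler", "http://www.example.com/page.html"):
            pass  # safe to download the page
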
    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

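    # A sketch of how mtime()/modified() are meant to be used by a
    # long-running spider (the 24-hour window is an arbitrary choice,
    # not something this module prescribes):
    #
    #     if time.time() - rp.mtime() > 24 * 3600:
    #         rp.read()
    #         rp.modified()
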
    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
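        # By this module's policy a 401/403 response means robots are
        # banned outright, while any other 4xx error (e.g. no
        # robots.txt at all) leaves the whole site open to crawling.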
        if self.errcode in (401, 403):
            self.disallow_all = True
        elif self.errcode >= 400:
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.
           We tolerate a user-agent: line that is not preceded by
           one or more blank lines."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber += 1
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)

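    # For illustration, feeding parse() this hypothetical robots.txt:
    #
    #     User-agent: *
    #     Allow: /private/public.html
    #     Disallow: /private/
    #
    # yields one catch-all entry (stored in self.default_entry) whose
    # rule lines are consulted in file order, so the more specific
    # Allow must come before the broader Disallow to take effect.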

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        parsed_url = urlparse.urlparse(urllib.unquote(url))
        url = urlparse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

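    # Note on can_fetch() above: only the path (plus params, query and
    # fragment) of the URL is compared against the rules; the scheme
    # and host are dropped, and both the URL and the rule paths are
    # percent-quoted so they compare consistently.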

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
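        # Matching is by simple string prefix: e.g. a rule path of
        # "/tmp" also covers "/tmp/" and "/tmp/index.html", while "*"
        # matches every path.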
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
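        # Matching is by substring: e.g. an entry naming "googlebot"
        # also applies to a (hypothetical) agent "Googlebot-News/2.1",
        # whose lowercased name token contains "googlebot".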
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True


class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        # If the robots.txt file is accessible only with a password,
        # we act as if the file weren't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
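

# A minimal, self-contained demonstration of the parser (the agent
# names, paths and rules below are made up for illustration):
if __name__ == '__main__':
    rp = RobotFileParser()
    rp.parse([
        "User-agent: figtree",
        "Disallow: /tmp",
        "",
        "User-agent: *",
        "Disallow: /cgi-bin/",
    ])
    # figtree is barred from /tmp; every other agent only from /cgi-bin/
    print rp.can_fetch("figtree", "http://example.com/tmp/a.html")    # False
    print rp.can_fetch("figtree", "http://example.com/index.html")    # True
    print rp.can_fetch("OtherBot", "http://example.com/cgi-bin/x")    # False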