""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://www.robotstxt.org/norobots-rfc.txt
"""

import collections
import urllib.error
import urllib.parse
import urllib.request

__all__ = ["RobotFileParser"]

class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                # access to robots.txt itself is restricted; play it
                # safe and treat everything as disallowed
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                # robots.txt is missing (or another client error);
                # treat everything as allowed
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow a user-agent: line that is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
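        # For illustration (an assumption, not part of the original file),
        # a typical record moves through those states like this:
        #
        #   User-agent: ExampleBot      -> state 1
        #   Disallow: /private/         -> state 2
        #   Allow: /private/ok.html     -> still state 2
        #   (blank line)                -> entry stored, back to state 0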
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # before trying to convert to int we need to make
                        # sure that robots.txt has valid syntax here;
                        # otherwise int() would raise a ValueError
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # check if all values are sane
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                            and numbers[1].strip().isdigit()):
                            # store an instance of the namedtuple rather than
                            # the class itself with mutated class attributes
                            req_rate = collections.namedtuple('req_rate',
                                                              'requests seconds')
                            entry.req_rate = req_rate(int(numbers[0]),
                                                      int(numbers[1]))
                        state = 2
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.parse.quote(url)
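        # For example (an illustrative note, not part of the original file):
        # a url of "http://example.com/a%20b.html" is reduced by the lines
        # above to "/a%20b.html"; the scheme and host are dropped and the
        # path is re-quoted before being matched against the rule paths.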
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def crawl_delay(self, useragent):
        """Return the Crawl-delay value that applies to useragent, if any."""
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        """Return the Request-rate value that applies to useragent, if any."""
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        if self.default_entry:
            return self.default_entry.req_rate
        return None

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path
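
    # Illustrative note (not part of the original source): RuleLine("/tmp", False)
    # disallows any path that starts with "/tmp", such as "/tmp/index.html",
    # while an empty Disallow value is converted above into an allow-everything
    # rule that matches every path.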


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
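

# A minimal usage sketch (illustrative only, not part of the original module):
# fetch a site's robots.txt and ask whether a crawler may retrieve a URL.
# The URL and the "ExampleBot" user-agent below are assumptions for the demo.
if __name__ == "__main__":
    rp = RobotFileParser()
    rp.set_url("http://www.example.com/robots.txt")
    rp.read()
    print(rp.can_fetch("ExampleBot", "http://www.example.com/private/page.html"))
    print(rp.crawl_delay("ExampleBot"))
    print(rp.request_rate("ExampleBot"))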