#!/usr/bin/env python
      2 """Spider to try and find bugs in the parser. Requires httplib2 and elementtree
      3 
      4 usage:
      5 import spider
      6 s = spider.Spider()
      7 s.spider("http://www.google.com", maxURLs=100)
      8 """

import hashlib
import urllib.request, urllib.error, urllib.parse
import urllib.robotparser

import httplib2

import html5lib

class Spider(object):
    def __init__(self):
        self.unvisitedURLs = set()
        self.visitedURLs = set()
        self.buggyURLs = set()
        self.robotParser = urllib.robotparser.RobotFileParser()
        self.contentDigest = {}
        self.http = httplib2.Http(".cache")

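    # Crawl loop: fetch the start page, then repeatedly pop a pending URL,
    # parse it, and let updateURLs() queue any newly discovered links, until
    # maxURLs pages have been parsed or the frontier is empty.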
    def run(self, initialURL, maxURLs=1000):
        urlNumber = 0
        self.visitedURLs.add(initialURL)
        content = self.loadURL(initialURL)
        while maxURLs is None or urlNumber < maxURLs:
            if content is not None:
                self.parse(content)
                urlNumber += 1
            if not self.unvisitedURLs:
                break
            content = self.loadURL(self.unvisitedURLs.pop())

    def parse(self, content):
        failed = False
        p = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("etree"))
        try:
            tree = p.parse(content)
        except Exception:
            # Any exception escaping the parser counts as a bug in html5lib.
            self.buggyURLs.add(self.currentURL)
            failed = True
            print("BUGGY:", self.currentURL)
        self.visitedURLs.add(self.currentURL)
        if not failed:
            self.updateURLs(tree)

    def loadURL(self, url):
        resp, content = self.http.request(url, "GET")
        self.currentURL = url
        # Skip pages whose body we have already seen under another URL.
        digest = hashlib.md5(content).hexdigest()
        if digest in self.contentDigest:
            content = None
            self.visitedURLs.add(url)
        else:
            self.contentDigest[digest] = url

        if resp['status'] != "200":
            content = None

        return content

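    # The filtering pipeline below: collect hrefs, resolve them against the
    # current host, drop non-http and non-HTML targets, then apply robots.txt.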
    def updateURLs(self, tree):
        """Take all the links in the current document, extract the URLs and
        update the list of visited and unvisited URLs according to whether we
        have seen them before or not"""
        urls = set()
        # Collect links we have not already visited or queued
        for link in tree.findall(".//a"):
            try:
                url = urllib.parse.urldefrag(link.attrib['href'])[0]
                if (url and url not in self.unvisitedURLs and
                        url not in self.visitedURLs):
                    urls.add(url)
            except KeyError:
                # <a> element without an href attribute
                pass

        # Remove all non-http URLs and add a suitable base URL where that is
        # missing
        newUrls = set()
        for url in urls:
            splitURL = list(urllib.parse.urlsplit(url))
            if splitURL[0] != "http":
                continue
            if splitURL[1] == "":
                splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
            newUrls.add(urllib.parse.urlunsplit(splitURL))
        urls = newUrls

        responseHeaders = {}
        # Now we want to find the content types of the links we haven't visited
        for url in urls:
            try:
                resp, content = self.http.request(url, "HEAD")
                responseHeaders[url] = resp
            except (AttributeError, KeyError):
                # Don't know why this happens
                pass

        # Remove links not of content-type html or pages not found
        # XXX - need to deal with other status codes?
        toVisit = set([url for url in urls if url in responseHeaders and
                       "html" in responseHeaders[url]['content-type'] and
                       responseHeaders[url]['status'] == "200"])

        # Now check we are allowed to spider the page
        for url in list(toVisit):  # iterate over a copy: we mutate toVisit
            robotURL = list(urllib.parse.urlsplit(url)[:2])
            robotURL.extend(["robots.txt", "", ""])
            robotURL = urllib.parse.urlunsplit(robotURL)
            self.robotParser.set_url(robotURL)
            # can_fetch() always denies until the robots.txt is actually read
            self.robotParser.read()
            if not self.robotParser.can_fetch("*", url):
                toVisit.remove(url)

        self.visitedURLs.update(urls)
        self.unvisitedURLs.update(toVisit)
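
# Minimal command-line entry point, sketched from the usage shown in the
# module docstring. The default start URL and the maxURLs value here are
# illustrative assumptions, not part of the original module.
if __name__ == "__main__":
    import sys

    startURL = sys.argv[1] if len(sys.argv) > 1 else "http://www.google.com"
    s = Spider()
    s.run(startURL, maxURLs=100)
    print("Visited %d URLs, %d provoked parser bugs" %
          (len(s.visitedURLs), len(s.buggyURLs)))
    for url in sorted(s.buggyURLs):
        print("BUGGY:", url)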