Home | History | Annotate | Download | only in webchecker
      1 #! /usr/bin/env python
      2 
      3 """A variant on webchecker that creates a mirror copy of a remote site."""
      4 
      5 __version__ = "$Revision$"
      6 
      7 import os
      8 import sys
      9 import urllib
     10 import getopt
     11 
     12 import webchecker
     13 
     14 # Extract real version number if necessary
     15 if __version__[0] == '$':
     16     _v = __version__.split()
     17     if len(_v) == 3:
     18         __version__ = _v[1]
     19 
     20 def main():
     21     verbose = webchecker.VERBOSE
     22     try:
     23         opts, args = getopt.getopt(sys.argv[1:], "qv")
     24     except getopt.error, msg:
     25         print msg
     26         print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
     27         return 2
     28     for o, a in opts:
     29         if o == "-q":
     30             verbose = 0
     31         if o == "-v":
     32             verbose = verbose + 1
     33     c = Sucker()
     34     c.setflags(verbose=verbose)
     35     c.urlopener.addheaders = [
     36             ('User-agent', 'websucker/%s' % __version__),
     37         ]
     38     for arg in args:
     39         print "Adding root", arg
     40         c.addroot(arg)
     41     print "Run..."
     42     c.run()
     43 
     44 class Sucker(webchecker.Checker):
     45 
     46     checkext = 0
     47     nonames = 1
     48 
     49     # SAM 11/13/99: in general, URLs are now URL pairs.
     50     # Since we've suppressed name anchor checking,
     51     # we can ignore the second dimension.
     52 
     53     def readhtml(self, url_pair):
     54         url = url_pair[0]
     55         text = None
     56         path = self.savefilename(url)
     57         try:
     58             f = open(path, "rb")
     59         except IOError:
     60             f = self.openpage(url_pair)
     61             if f:
     62                 info = f.info()
     63                 nurl = f.geturl()
     64                 if nurl != url:
     65                     url = nurl
     66                     path = self.savefilename(url)
     67                 text = f.read()
     68                 f.close()
     69                 self.savefile(text, path)
     70                 if not self.checkforhtml(info, url):
     71                     text = None
     72         else:
     73             if self.checkforhtml({}, url):
     74                 text = f.read()
     75             f.close()
     76         return text, url
     77 
     78     def savefile(self, text, path):
     79         dir, base = os.path.split(path)
     80         makedirs(dir)
     81         try:
     82             f = open(path, "wb")
     83             f.write(text)
     84             f.close()
     85             self.message("saved %s", path)
     86         except IOError, msg:
     87             self.message("didn't save %s: %s", path, str(msg))
     88 
     89     def savefilename(self, url):
     90         type, rest = urllib.splittype(url)
     91         host, path = urllib.splithost(rest)
     92         path = path.lstrip("/")
     93         user, host = urllib.splituser(host)
     94         host, port = urllib.splitnport(host)
     95         host = host.lower()
     96         if not path or path[-1] == "/":
     97             path = path + "index.html"
     98         if os.sep != "/":
     99             path = os.sep.join(path.split("/"))
    100         path = os.path.join(host, path)
    101         return path
    102 
    103 def makedirs(dir):
    104     if not dir:
    105         return
    106     if os.path.exists(dir):
    107         if not os.path.isdir(dir):
    108             try:
    109                 os.rename(dir, dir + ".bak")
    110                 os.mkdir(dir)
    111                 os.rename(dir + ".bak", os.path.join(dir, "index.html"))
    112             except os.error:
    113                 pass
    114         return
    115     head, tail = os.path.split(dir)
    116     if not tail:
    117         print "Huh?  Don't know how to make dir", dir
    118         return
    119     makedirs(head)
    120     os.mkdir(dir, 0777)
    121 
    122 if __name__ == '__main__':
    123     sys.exit(main() or 0)
    124