1 #! /usr/bin/env python 2 3 """A variant on webchecker that creates a mirror copy of a remote site.""" 4 5 __version__ = "$Revision$" 6 7 import os 8 import sys 9 import urllib 10 import getopt 11 12 import webchecker 13 14 # Extract real version number if necessary 15 if __version__[0] == '$': 16 _v = __version__.split() 17 if len(_v) == 3: 18 __version__ = _v[1] 19 20 def main(): 21 verbose = webchecker.VERBOSE 22 try: 23 opts, args = getopt.getopt(sys.argv[1:], "qv") 24 except getopt.error, msg: 25 print msg 26 print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..." 27 return 2 28 for o, a in opts: 29 if o == "-q": 30 verbose = 0 31 if o == "-v": 32 verbose = verbose + 1 33 c = Sucker() 34 c.setflags(verbose=verbose) 35 c.urlopener.addheaders = [ 36 ('User-agent', 'websucker/%s' % __version__), 37 ] 38 for arg in args: 39 print "Adding root", arg 40 c.addroot(arg) 41 print "Run..." 42 c.run() 43 44 class Sucker(webchecker.Checker): 45 46 checkext = 0 47 nonames = 1 48 49 # SAM 11/13/99: in general, URLs are now URL pairs. 50 # Since we've suppressed name anchor checking, 51 # we can ignore the second dimension. 52 53 def readhtml(self, url_pair): 54 url = url_pair[0] 55 text = None 56 path = self.savefilename(url) 57 try: 58 f = open(path, "rb") 59 except IOError: 60 f = self.openpage(url_pair) 61 if f: 62 info = f.info() 63 nurl = f.geturl() 64 if nurl != url: 65 url = nurl 66 path = self.savefilename(url) 67 text = f.read() 68 f.close() 69 self.savefile(text, path) 70 if not self.checkforhtml(info, url): 71 text = None 72 else: 73 if self.checkforhtml({}, url): 74 text = f.read() 75 f.close() 76 return text, url 77 78 def savefile(self, text, path): 79 dir, base = os.path.split(path) 80 makedirs(dir) 81 try: 82 f = open(path, "wb") 83 f.write(text) 84 f.close() 85 self.message("saved %s", path) 86 except IOError, msg: 87 self.message("didn't save %s: %s", path, str(msg)) 88 89 def savefilename(self, url): 90 type, rest = urllib.splittype(url) 91 host, path = urllib.splithost(rest) 92 path = path.lstrip("/") 93 user, host = urllib.splituser(host) 94 host, port = urllib.splitnport(host) 95 host = host.lower() 96 if not path or path[-1] == "/": 97 path = path + "index.html" 98 if os.sep != "/": 99 path = os.sep.join(path.split("/")) 100 path = os.path.join(host, path) 101 return path 102 103 def makedirs(dir): 104 if not dir: 105 return 106 if os.path.exists(dir): 107 if not os.path.isdir(dir): 108 try: 109 os.rename(dir, dir + ".bak") 110 os.mkdir(dir) 111 os.rename(dir + ".bak", os.path.join(dir, "index.html")) 112 except os.error: 113 pass 114 return 115 head, tail = os.path.split(dir) 116 if not tail: 117 print "Huh? Don't know how to make dir", dir 118 return 119 makedirs(head) 120 os.mkdir(dir, 0777) 121 122 if __name__ == '__main__': 123 sys.exit(main() or 0) 124