      1 """Open an arbitrary URL.
      2 
      3 See the following document for more info on URLs:
      4 "Names and Addresses, URIs, URLs, URNs, URCs", at
      5 http://www.w3.org/pub/WWW/Addressing/Overview.html
      6 
      7 See also the HTTP spec (from which the error codes are derived):
      8 "HTTP - Hypertext Transfer Protocol", at
      9 http://www.w3.org/pub/WWW/Protocols/
     10 
     11 Related standards and specs:
     12 - RFC1808: the "relative URL" spec. (authoritative status)
     13 - RFC1738: the "URL standard". (authoritative status)
     14 - RFC1630: the "URI spec". (informational status)
     15 
     16 The object returned by URLopener().open(file) will differ per
     17 protocol.  All you know is that it has methods read(), readline(),
     18 readlines(), fileno(), close() and info().  The read*(), fileno()
     19 and close() methods work like those of open files.
     20 The info() method returns a mimetools.Message object which can be
     21 used to query various info about the object, if available.
     22 (mimetools.Message objects are queried with the getheader() method.)
     23 """
     24 
     25 import string
     26 import socket
     27 import os
     28 import time
     29 import sys
     30 from urlparse import urljoin as basejoin
     31 
     32 __all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
     33            "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
     34            "urlencode", "url2pathname", "pathname2url", "splittag",
     35            "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
     36            "splittype", "splithost", "splituser", "splitpasswd", "splitport",
     37            "splitnport", "splitquery", "splitattr", "splitvalue",
     38            "getproxies"]
     39 
     40 __version__ = '1.17'    # XXX This version is not always updated :-(
     41 
     42 MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
     43 
     44 # Helpers for non-Unix systems
     45 if os.name == 'nt':
     46     from nturl2path import url2pathname, pathname2url
     47 elif os.name == 'riscos':
     48     from rourl2path import url2pathname, pathname2url
     49 else:
     50     def url2pathname(pathname):
     51         """OS-specific conversion from a relative URL of the 'file' scheme
     52         to a file system path; not recommended for general use."""
     53         return unquote(pathname)
     54 
     55     def pathname2url(pathname):
     56         """OS-specific conversion from a file system path to a relative URL
     57         of the 'file' scheme; not recommended for general use."""
     58         return quote(pathname)
     59 
     60 # This really consists of two pieces:
     61 # (1) a class which handles opening of all sorts of URLs
     62 #     (plus assorted utilities etc.)
     63 # (2) a set of functions for parsing URLs
     64 # XXX Should these be separated out into different modules?
     65 
     66 
     67 # Shortcut for basic usage
     68 _urlopener = None
     69 def urlopen(url, data=None, proxies=None):
     70     """Create a file-like object for the specified URL to read from."""
     71     from warnings import warnpy3k
     72     warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
     73              "favor of urllib2.urlopen()", stacklevel=2)
     74 
     75     global _urlopener
     76     if proxies is not None:
     77         opener = FancyURLopener(proxies=proxies)
     78     elif not _urlopener:
     79         opener = FancyURLopener()
     80         _urlopener = opener
     81     else:
     82         opener = _urlopener
     83     if data is None:
     84         return opener.open(url)
     85     else:
     86         return opener.open(url, data)
     87 def urlretrieve(url, filename=None, reporthook=None, data=None):
     88     global _urlopener
     89     if not _urlopener:
     90         _urlopener = FancyURLopener()
     91     return _urlopener.retrieve(url, filename, reporthook, data)
     92 def urlcleanup():
     93     if _urlopener:
     94         _urlopener.cleanup()
     95     _safe_quoters.clear()
     96     ftpcache.clear()
     97 
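# A minimal usage sketch for the shortcut helpers above.  It is wrapped in a
# function so nothing runs at import time; the URL, filename and header name
# are placeholders, and the calls require a reachable HTTP server.
def _example_shortcut_usage():
    f = urlopen('http://www.example.com/')       # hypothetical URL
    print f.info().getheader('Content-Type')     # headers are a mimetools.Message
    data = f.read()                              # file-like read()/readline()/close()
    f.close()
    # urlretrieve() copies the resource to a local file and can report progress;
    # the hook receives (block number, block size, total size or -1).
    def report(blocknum, blocksize, totalsize):
        print blocknum * blocksize, 'of', totalsize
    filename, headers = urlretrieve('http://www.example.com/',
                                    'example.html', report)   # hypothetical filename
    urlcleanup()    # discard temporary files and cached openers
    return data, filename, headers
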
     98 # check for SSL
     99 try:
    100     import ssl
    101 except ImportError:
    102     _have_ssl = False
    103 else:
    104     _have_ssl = True
    105 
    106 # exception raised when downloaded size does not match content-length
    107 class ContentTooShortError(IOError):
    108     def __init__(self, message, content):
    109         IOError.__init__(self, message)
    110         self.content = content
    111 
    112 ftpcache = {}
    113 class URLopener:
    114     """Class to open URLs.
    115     This is a class rather than just a subroutine because we may need
    116     more than one set of global protocol-specific options.
    117     Note -- this is a base class for those who don't want the
    118     automatic handling of error types 302 (relocated) and 401
    119     (authorization needed)."""
    120 
    121     __tempfiles = None
    122 
    123     version = "Python-urllib/%s" % __version__
    124 
    125     # Constructor
    126     def __init__(self, proxies=None, **x509):
    127         if proxies is None:
    128             proxies = getproxies()
    129         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
    130         self.proxies = proxies
    131         self.key_file = x509.get('key_file')
    132         self.cert_file = x509.get('cert_file')
    133         self.addheaders = [('User-Agent', self.version)]
    134         self.__tempfiles = []
    135         self.__unlink = os.unlink # See cleanup()
    136         self.tempcache = None
    137         # Undocumented feature: if you assign {} to tempcache,
    138         # it is used to cache files retrieved with
    139         # self.retrieve().  This is not enabled by default
    140         # since it does not work for changing documents (and I
    141         # haven't got the logic to check expiration headers
    142         # yet).
    143         self.ftpcache = ftpcache
    144         # Undocumented feature: you can use a different
    145         # ftp cache by assigning to the .ftpcache member;
    146         # in case you want logically independent URL openers
    147         # XXX This is not threadsafe.  Bah.
    148 
    149     def __del__(self):
    150         self.close()
    151 
    152     def close(self):
    153         self.cleanup()
    154 
    155     def cleanup(self):
    156         # This code sometimes runs when the rest of this module
    157         # has already been deleted, so it can't use any globals
    158         # or import anything.
    159         if self.__tempfiles:
    160             for file in self.__tempfiles:
    161                 try:
    162                     self.__unlink(file)
    163                 except OSError:
    164                     pass
    165             del self.__tempfiles[:]
    166         if self.tempcache:
    167             self.tempcache.clear()
    168 
    169     def addheader(self, *args):
    170         """Add a header to be used by the HTTP interface only,
    171         e.g. u.addheader('Accept', 'sound/basic')"""
    172         self.addheaders.append(args)
    173 
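    # A brief usage sketch for the constructor and addheader() above; the
    # proxy URL and target URL are placeholders:
    #
    #   opener = URLopener(proxies={'http': 'http://proxy.example.com:3128/'})
    #   opener.addheader('Accept', 'text/html')
    #   f = opener.open('http://www.example.com/')
    #   headers = f.info()
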
    174     # External interface
    175     def open(self, fullurl, data=None):
    176         """Use URLopener().open(file) instead of open(file, 'r')."""
    177         fullurl = unwrap(toBytes(fullurl))
    178         # percent-encode the URL to work around lame servers that choke
    179         # on characters such as spaces within URL paths.
    180         fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
    181         if self.tempcache and fullurl in self.tempcache:
    182             filename, headers = self.tempcache[fullurl]
    183             fp = open(filename, 'rb')
    184             return addinfourl(fp, headers, fullurl)
    185         urltype, url = splittype(fullurl)
    186         if not urltype:
    187             urltype = 'file'
    188         if urltype in self.proxies:
    189             proxy = self.proxies[urltype]
    190             urltype, proxyhost = splittype(proxy)
    191             host, selector = splithost(proxyhost)
    192             url = (host, fullurl) # Signal special case to open_*()
    193         else:
    194             proxy = None
    195         name = 'open_' + urltype
    196         self.type = urltype
    197         name = name.replace('-', '_')
    198         if not hasattr(self, name):
    199             if proxy:
    200                 return self.open_unknown_proxy(proxy, fullurl, data)
    201             else:
    202                 return self.open_unknown(fullurl, data)
    203         try:
    204             if data is None:
    205                 return getattr(self, name)(url)
    206             else:
    207                 return getattr(self, name)(url, data)
    208         except socket.error, msg:
    209             raise IOError, ('socket error', msg), sys.exc_info()[2]
    210 
    211     def open_unknown(self, fullurl, data=None):
    212         """Overridable interface to open unknown URL type."""
    213         type, url = splittype(fullurl)
    214         raise IOError, ('url error', 'unknown url type', type)
    215 
    216     def open_unknown_proxy(self, proxy, fullurl, data=None):
    217         """Overridable interface to open unknown URL type."""
    218         type, url = splittype(fullurl)
    219         raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
    220 
    221     # External interface
    222     def retrieve(self, url, filename=None, reporthook=None, data=None):
    223         """retrieve(url) returns (filename, headers) for a local object
    224         or (tempfilename, headers) for a remote object."""
    225         url = unwrap(toBytes(url))
    226         if self.tempcache and url in self.tempcache:
    227             return self.tempcache[url]
    228         type, url1 = splittype(url)
    229         if filename is None and (not type or type == 'file'):
    230             try:
    231                 fp = self.open_local_file(url1)
    232                 hdrs = fp.info()
    233                 fp.close()
    234                 return url2pathname(splithost(url1)[1]), hdrs
    235             except IOError:
    236                 pass
    237         fp = self.open(url, data)
    238         try:
    239             headers = fp.info()
    240             if filename:
    241                 tfp = open(filename, 'wb')
    242             else:
    243                 import tempfile
    244                 garbage, path = splittype(url)
    245                 garbage, path = splithost(path or "")
    246                 path, garbage = splitquery(path or "")
    247                 path, garbage = splitattr(path or "")
    248                 suffix = os.path.splitext(path)[1]
    249                 (fd, filename) = tempfile.mkstemp(suffix)
    250                 self.__tempfiles.append(filename)
    251                 tfp = os.fdopen(fd, 'wb')
    252             try:
    253                 result = filename, headers
    254                 if self.tempcache is not None:
    255                     self.tempcache[url] = result
    256                 bs = 1024*8
    257                 size = -1
    258                 read = 0
    259                 blocknum = 0
    260                 if reporthook:
    261                     if "content-length" in headers:
    262                         size = int(headers["Content-Length"])
    263                     reporthook(blocknum, bs, size)
    264                 while 1:
    265                     block = fp.read(bs)
    266                     if block == "":
    267                         break
    268                     read += len(block)
    269                     tfp.write(block)
    270                     blocknum += 1
    271                     if reporthook:
    272                         reporthook(blocknum, bs, size)
    273             finally:
    274                 tfp.close()
    275         finally:
    276             fp.close()
    277 
    278         # raise exception if actual size does not match content-length header
    279         if size >= 0 and read < size:
    280             raise ContentTooShortError("retrieval incomplete: got only %i out "
    281                                        "of %i bytes" % (read, size), result)
    282 
    283         return result
    284 
    285     # Each method named open_<type> knows how to open that type of URL
    286 
    287     def open_http(self, url, data=None):
    288         """Use HTTP protocol."""
    289         import httplib
    290         user_passwd = None
    291         proxy_passwd = None
    292         if isinstance(url, str):
    293             host, selector = splithost(url)
    294             if host:
    295                 user_passwd, host = splituser(host)
    296                 host = unquote(host)
    297             realhost = host
    298         else:
    299             host, selector = url
    300             # check whether the proxy contains authorization information
    301             proxy_passwd, host = splituser(host)
    302             # now we proceed with the url we want to obtain
    303             urltype, rest = splittype(selector)
    304             url = rest
    305             user_passwd = None
    306             if urltype.lower() != 'http':
    307                 realhost = None
    308             else:
    309                 realhost, rest = splithost(rest)
    310                 if realhost:
    311                     user_passwd, realhost = splituser(realhost)
    312                 if user_passwd:
    313                     selector = "%s://%s%s" % (urltype, realhost, rest)
    314                 if proxy_bypass(realhost):
    315                     host = realhost
    316 
    317             #print "proxy via http:", host, selector
    318         if not host: raise IOError, ('http error', 'no host given')
    319 
    320         if proxy_passwd:
    321             import base64
    322             proxy_auth = base64.b64encode(proxy_passwd).strip()
    323         else:
    324             proxy_auth = None
    325 
    326         if user_passwd:
    327             import base64
    328             auth = base64.b64encode(user_passwd).strip()
    329         else:
    330             auth = None
    331         h = httplib.HTTP(host)
    332         if data is not None:
    333             h.putrequest('POST', selector)
    334             h.putheader('Content-Type', 'application/x-www-form-urlencoded')
    335             h.putheader('Content-Length', '%d' % len(data))
    336         else:
    337             h.putrequest('GET', selector)
    338         if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
    339         if auth: h.putheader('Authorization', 'Basic %s' % auth)
    340         if realhost: h.putheader('Host', realhost)
    341         for args in self.addheaders: h.putheader(*args)
    342         h.endheaders(data)
    343         errcode, errmsg, headers = h.getreply()
    344         fp = h.getfile()
    345         if errcode == -1:
    346             if fp: fp.close()
    347             # something went wrong with the HTTP status line
    348             raise IOError, ('http protocol error', 0,
    349                             'got a bad status line', None)
    350         # According to RFC 2616, "2xx" code indicates that the client's
    351         # request was successfully received, understood, and accepted.
    352         if (200 <= errcode < 300):
    353             return addinfourl(fp, headers, "http:" + url, errcode)
    354         else:
    355             if data is None:
    356                 return self.http_error(url, fp, errcode, errmsg, headers)
    357             else:
    358                 return self.http_error(url, fp, errcode, errmsg, headers, data)
    359 
    360     def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    361         """Handle http errors.
    362         Derived class can override this, or provide specific handlers
    363         named http_error_DDD where DDD is the 3-digit error code."""
    364         # First check if there's a specific handler for this error
    365         name = 'http_error_%d' % errcode
    366         if hasattr(self, name):
    367             method = getattr(self, name)
    368             if data is None:
    369                 result = method(url, fp, errcode, errmsg, headers)
    370             else:
    371                 result = method(url, fp, errcode, errmsg, headers, data)
    372             if result: return result
    373         return self.http_error_default(url, fp, errcode, errmsg, headers)
    374 
    375     def http_error_default(self, url, fp, errcode, errmsg, headers):
    376         """Default error handler: close the connection and raise IOError."""
    377         void = fp.read()
    378         fp.close()
    379         raise IOError, ('http error', errcode, errmsg, headers)
    380 
    381     if _have_ssl:
    382         def open_https(self, url, data=None):
    383             """Use HTTPS protocol."""
    384 
    385             import httplib
    386             user_passwd = None
    387             proxy_passwd = None
    388             if isinstance(url, str):
    389                 host, selector = splithost(url)
    390                 if host:
    391                     user_passwd, host = splituser(host)
    392                     host = unquote(host)
    393                 realhost = host
    394             else:
    395                 host, selector = url
    396                 # check whether the proxy contains authorization information
    397                 proxy_passwd, host = splituser(host)
    398                 urltype, rest = splittype(selector)
    399                 url = rest
    400                 user_passwd = None
    401                 if urltype.lower() != 'https':
    402                     realhost = None
    403                 else:
    404                     realhost, rest = splithost(rest)
    405                     if realhost:
    406                         user_passwd, realhost = splituser(realhost)
    407                     if user_passwd:
    408                         selector = "%s://%s%s" % (urltype, realhost, rest)
    409                 #print "proxy via https:", host, selector
    410             if not host: raise IOError, ('https error', 'no host given')
    411             if proxy_passwd:
    412                 import base64
    413                 proxy_auth = base64.b64encode(proxy_passwd).strip()
    414             else:
    415                 proxy_auth = None
    416             if user_passwd:
    417                 import base64
    418                 auth = base64.b64encode(user_passwd).strip()
    419             else:
    420                 auth = None
    421             h = httplib.HTTPS(host, 0,
    422                               key_file=self.key_file,
    423                               cert_file=self.cert_file)
    424             if data is not None:
    425                 h.putrequest('POST', selector)
    426                 h.putheader('Content-Type',
    427                             'application/x-www-form-urlencoded')
    428                 h.putheader('Content-Length', '%d' % len(data))
    429             else:
    430                 h.putrequest('GET', selector)
    431             if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
    432             if auth: h.putheader('Authorization', 'Basic %s' % auth)
    433             if realhost: h.putheader('Host', realhost)
    434             for args in self.addheaders: h.putheader(*args)
    435             h.endheaders(data)
    436             errcode, errmsg, headers = h.getreply()
    437             fp = h.getfile()
    438             if errcode == -1:
    439                 if fp: fp.close()
    440                 # something went wrong with the HTTP status line
    441                 raise IOError, ('http protocol error', 0,
    442                                 'got a bad status line', None)
    443             # According to RFC 2616, "2xx" code indicates that the client's
    444             # request was successfully received, understood, and accepted.
    445             if (200 <= errcode < 300):
    446                 return addinfourl(fp, headers, "https:" + url, errcode)
    447             else:
    448                 if data is None:
    449                     return self.http_error(url, fp, errcode, errmsg, headers)
    450                 else:
    451                     return self.http_error(url, fp, errcode, errmsg, headers,
    452                                            data)
    453 
    454     def open_file(self, url):
    455         """Use local file or FTP depending on form of URL."""
    456         if not isinstance(url, str):
    457             raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
    458         if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
    459             return self.open_ftp(url)
    460         else:
    461             return self.open_local_file(url)
    462 
    463     def open_local_file(self, url):
    464         """Use local file."""
    465         import mimetypes, mimetools, email.utils
    466         try:
    467             from cStringIO import StringIO
    468         except ImportError:
    469             from StringIO import StringIO
    470         host, file = splithost(url)
    471         localname = url2pathname(file)
    472         try:
    473             stats = os.stat(localname)
    474         except OSError, e:
    475             raise IOError(e.errno, e.strerror, e.filename)
    476         size = stats.st_size
    477         modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    478         mtype = mimetypes.guess_type(url)[0]
    479         headers = mimetools.Message(StringIO(
    480             'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
    481             (mtype or 'text/plain', size, modified)))
    482         if not host:
    483             urlfile = file
    484             if file[:1] == '/':
    485                 urlfile = 'file://' + file
    486             return addinfourl(open(localname, 'rb'),
    487                               headers, urlfile)
    488         host, port = splitport(host)
    489         if not port \
    490            and socket.gethostbyname(host) in (localhost(), thishost()):
    491             urlfile = file
    492             if file[:1] == '/':
    493                 urlfile = 'file://' + file
    494             return addinfourl(open(localname, 'rb'),
    495                               headers, urlfile)
    496         raise IOError, ('local file error', 'not on local host')
    497 
    498     def open_ftp(self, url):
    499         """Use FTP protocol."""
    500         if not isinstance(url, str):
    501             raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
    502         import mimetypes, mimetools
    503         try:
    504             from cStringIO import StringIO
    505         except ImportError:
    506             from StringIO import StringIO
    507         host, path = splithost(url)
    508         if not host: raise IOError, ('ftp error', 'no host given')
    509         host, port = splitport(host)
    510         user, host = splituser(host)
    511         if user: user, passwd = splitpasswd(user)
    512         else: passwd = None
    513         host = unquote(host)
    514         user = user or ''
    515         passwd = passwd or ''
    516         host = socket.gethostbyname(host)
    517         if not port:
    518             import ftplib
    519             port = ftplib.FTP_PORT
    520         else:
    521             port = int(port)
    522         path, attrs = splitattr(path)
    523         path = unquote(path)
    524         dirs = path.split('/')
    525         dirs, file = dirs[:-1], dirs[-1]
    526         if dirs and not dirs[0]: dirs = dirs[1:]
    527         if dirs and not dirs[0]: dirs[0] = '/'
    528         key = user, host, port, '/'.join(dirs)
    529         # XXX thread unsafe!
    530         if len(self.ftpcache) > MAXFTPCACHE:
    531             # Prune the cache, rather arbitrarily
    532             for k in self.ftpcache.keys():
    533                 if k != key:
    534                     v = self.ftpcache[k]
    535                     del self.ftpcache[k]
    536                     v.close()
    537         try:
    538             if not key in self.ftpcache:
    539                 self.ftpcache[key] = \
    540                     ftpwrapper(user, passwd, host, port, dirs)
    541             if not file: type = 'D'
    542             else: type = 'I'
    543             for attr in attrs:
    544                 attr, value = splitvalue(attr)
    545                 if attr.lower() == 'type' and \
    546                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
    547                     type = value.upper()
    548             (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
    549             mtype = mimetypes.guess_type("ftp:" + url)[0]
    550             headers = ""
    551             if mtype:
    552                 headers += "Content-Type: %s\n" % mtype
    553             if retrlen is not None and retrlen >= 0:
    554                 headers += "Content-Length: %d\n" % retrlen
    555             headers = mimetools.Message(StringIO(headers))
    556             return addinfourl(fp, headers, "ftp:" + url)
    557         except ftperrors(), msg:
    558             raise IOError, ('ftp error', msg), sys.exc_info()[2]
    559 
    560     def open_data(self, url, data=None):
    561         """Use "data" URL."""
    562         if not isinstance(url, str):
    563             raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
    564         # ignore POSTed data
    565         #
    566         # syntax of data URLs:
    567         # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    568         # mediatype := [ type "/" subtype ] *( ";" parameter )
    569         # data      := *urlchar
    570         # parameter := attribute "=" value
    571         import mimetools
    572         try:
    573             from cStringIO import StringIO
    574         except ImportError:
    575             from StringIO import StringIO
    576         try:
    577             [type, data] = url.split(',', 1)
    578         except ValueError:
    579             raise IOError, ('data error', 'bad data URL')
    580         if not type:
    581             type = 'text/plain;charset=US-ASCII'
    582         semi = type.rfind(';')
    583         if semi >= 0 and '=' not in type[semi:]:
    584             encoding = type[semi+1:]
    585             type = type[:semi]
    586         else:
    587             encoding = ''
    588         msg = []
    589         msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
    590                                             time.gmtime(time.time())))
    591         msg.append('Content-type: %s' % type)
    592         if encoding == 'base64':
    593             import base64
    594             data = base64.decodestring(data)
    595         else:
    596             data = unquote(data)
    597         msg.append('Content-Length: %d' % len(data))
    598         msg.append('')
    599         msg.append(data)
    600         msg = '\n'.join(msg)
    601         f = StringIO(msg)
    602         headers = mimetools.Message(f, 0)
    603         #f.fileno = None     # needed for addinfourl
    604         return addinfourl(f, headers, url)
    605 
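# A small sketch of the "data" URL handling documented in open_data() above.
# It is never executed at import time; the base64 payload should decode to
# 'Hello, World!'.
def _example_data_url():
    opener = URLopener()
    f = opener.open('data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==')
    assert f.info().gettype() == 'text/plain'
    assert f.read() == 'Hello, World!'
    f.close()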
    606 
    607 class FancyURLopener(URLopener):
    608     """Derived class with handlers for errors we can handle (perhaps)."""
    609 
    610     def __init__(self, *args, **kwargs):
    611         URLopener.__init__(self, *args, **kwargs)
    612         self.auth_cache = {}
    613         self.tries = 0
    614         self.maxtries = 10
    615 
    616     def http_error_default(self, url, fp, errcode, errmsg, headers):
    617         """Default error handling -- don't raise an exception."""
    618         return addinfourl(fp, headers, "http:" + url, errcode)
    619 
    620     def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
    621         """Error 302 -- relocated (temporarily)."""
    622         self.tries += 1
    623         if self.maxtries and self.tries >= self.maxtries:
    624             if hasattr(self, "http_error_500"):
    625                 meth = self.http_error_500
    626             else:
    627                 meth = self.http_error_default
    628             self.tries = 0
    629             return meth(url, fp, 500,
    630                         "Internal Server Error: Redirect Recursion", headers)
    631         result = self.redirect_internal(url, fp, errcode, errmsg, headers,
    632                                         data)
    633         self.tries = 0
    634         return result
    635 
    636     def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
    637         if 'location' in headers:
    638             newurl = headers['location']
    639         elif 'uri' in headers:
    640             newurl = headers['uri']
    641         else:
    642             return
    643         void = fp.read()
    644         fp.close()
    645         # In case the server sent a relative URL, join with original:
    646         newurl = basejoin(self.type + ":" + url, newurl)
    647 
    648         # For security reasons we do not allow redirects to protocols
    649         # other than HTTP, HTTPS or FTP.
    650         newurl_lower = newurl.lower()
    651         if not (newurl_lower.startswith('http://') or
    652                 newurl_lower.startswith('https://') or
    653                 newurl_lower.startswith('ftp://')):
    654             raise IOError('redirect error', errcode,
    655                           errmsg + " - Redirection to url '%s' is not allowed" %
    656                           newurl,
    657                           headers)
    658 
    659         return self.open(newurl)
    660 
    661     def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
    662         """Error 301 -- also relocated (permanently)."""
    663         return self.http_error_302(url, fp, errcode, errmsg, headers, data)
    664 
    665     def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
    666         """Error 303 -- also relocated (essentially identical to 302)."""
    667         return self.http_error_302(url, fp, errcode, errmsg, headers, data)
    668 
    669     def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
    670         """Error 307 -- relocated, but turn POST into error."""
    671         if data is None:
    672             return self.http_error_302(url, fp, errcode, errmsg, headers, data)
    673         else:
    674             return self.http_error_default(url, fp, errcode, errmsg, headers)
    675 
    676     def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
    677         """Error 401 -- authentication required.
    678         This function supports Basic authentication only."""
    679         if not 'www-authenticate' in headers:
    680             URLopener.http_error_default(self, url, fp,
    681                                          errcode, errmsg, headers)
    682         stuff = headers['www-authenticate']
    683         import re
    684         match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
    685         if not match:
    686             URLopener.http_error_default(self, url, fp,
    687                                          errcode, errmsg, headers)
    688         scheme, realm = match.groups()
    689         if scheme.lower() != 'basic':
    690             URLopener.http_error_default(self, url, fp,
    691                                          errcode, errmsg, headers)
    692         name = 'retry_' + self.type + '_basic_auth'
    693         if data is None:
    694             return getattr(self,name)(url, realm)
    695         else:
    696             return getattr(self,name)(url, realm, data)
    697 
    698     def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
    699         """Error 407 -- proxy authentication required.
    700         This function supports Basic authentication only."""
    701         if not 'proxy-authenticate' in headers:
    702             URLopener.http_error_default(self, url, fp,
    703                                          errcode, errmsg, headers)
    704         stuff = headers['proxy-authenticate']
    705         import re
    706         match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
    707         if not match:
    708             URLopener.http_error_default(self, url, fp,
    709                                          errcode, errmsg, headers)
    710         scheme, realm = match.groups()
    711         if scheme.lower() != 'basic':
    712             URLopener.http_error_default(self, url, fp,
    713                                          errcode, errmsg, headers)
    714         name = 'retry_proxy_' + self.type + '_basic_auth'
    715         if data is None:
    716             return getattr(self,name)(url, realm)
    717         else:
    718             return getattr(self,name)(url, realm, data)
    719 
    720     def retry_proxy_http_basic_auth(self, url, realm, data=None):
    721         host, selector = splithost(url)
    722         newurl = 'http://' + host + selector
    723         proxy = self.proxies['http']
    724         urltype, proxyhost = splittype(proxy)
    725         proxyhost, proxyselector = splithost(proxyhost)
    726         i = proxyhost.find('@') + 1
    727         proxyhost = proxyhost[i:]
    728         user, passwd = self.get_user_passwd(proxyhost, realm, i)
    729         if not (user or passwd): return None
    730         proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
    731         self.proxies['http'] = 'http://' + proxyhost + proxyselector
    732         if data is None:
    733             return self.open(newurl)
    734         else:
    735             return self.open(newurl, data)
    736 
    737     def retry_proxy_https_basic_auth(self, url, realm, data=None):
    738         host, selector = splithost(url)
    739         newurl = 'https://' + host + selector
    740         proxy = self.proxies['https']
    741         urltype, proxyhost = splittype(proxy)
    742         proxyhost, proxyselector = splithost(proxyhost)
    743         i = proxyhost.find('@') + 1
    744         proxyhost = proxyhost[i:]
    745         user, passwd = self.get_user_passwd(proxyhost, realm, i)
    746         if not (user or passwd): return None
    747         proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
    748         self.proxies['https'] = 'https://' + proxyhost + proxyselector
    749         if data is None:
    750             return self.open(newurl)
    751         else:
    752             return self.open(newurl, data)
    753 
    754     def retry_http_basic_auth(self, url, realm, data=None):
    755         host, selector = splithost(url)
    756         i = host.find('@') + 1
    757         host = host[i:]
    758         user, passwd = self.get_user_passwd(host, realm, i)
    759         if not (user or passwd): return None
    760         host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
    761         newurl = 'http://' + host + selector
    762         if data is None:
    763             return self.open(newurl)
    764         else:
    765             return self.open(newurl, data)
    766 
    767     def retry_https_basic_auth(self, url, realm, data=None):
    768         host, selector = splithost(url)
    769         i = host.find('@') + 1
    770         host = host[i:]
    771         user, passwd = self.get_user_passwd(host, realm, i)
    772         if not (user or passwd): return None
    773         host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
    774         newurl = 'https://' + host + selector
    775         if data is None:
    776             return self.open(newurl)
    777         else:
    778             return self.open(newurl, data)
    779 
    780     def get_user_passwd(self, host, realm, clear_cache=0):
    781         key = realm + '@' + host.lower()
    782         if key in self.auth_cache:
    783             if clear_cache:
    784                 del self.auth_cache[key]
    785             else:
    786                 return self.auth_cache[key]
    787         user, passwd = self.prompt_user_passwd(host, realm)
    788         if user or passwd: self.auth_cache[key] = (user, passwd)
    789         return user, passwd
    790 
    791     def prompt_user_passwd(self, host, realm):
    792         """Override this in a GUI environment!"""
    793         import getpass
    794         try:
    795             user = raw_input("Enter username for %s at %s: " % (realm,
    796                                                                 host))
    797             passwd = getpass.getpass("Enter password for %s in %s at %s: " %
    798                 (user, realm, host))
    799             return user, passwd
    800         except KeyboardInterrupt:
    801             print
    802             return None, None
    803 
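# A sketch of non-interactive Basic authentication: override prompt_user_passwd()
# in a FancyURLopener subclass so 401/407 retries use preset credentials.  The
# class name, credentials and URL below are placeholders, and nothing runs at
# import time.
def _example_preset_auth():
    class PresetAuthOpener(FancyURLopener):
        def prompt_user_passwd(self, host, realm):
            return 'alice', 'secret'        # hypothetical credentials
    opener = PresetAuthOpener()
    return opener.open('http://www.example.com/protected/')   # hypothetical URL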
    804 
    805 # Utility functions
    806 
    807 _localhost = None
    808 def localhost():
    809     """Return the IP address of the magic hostname 'localhost'."""
    810     global _localhost
    811     if _localhost is None:
    812         _localhost = socket.gethostbyname('localhost')
    813     return _localhost
    814 
    815 _thishost = None
    816 def thishost():
    817     """Return the IP address of the current host."""
    818     global _thishost
    819     if _thishost is None:
    820         _thishost = socket.gethostbyname(socket.gethostname())
    821     return _thishost
    822 
    823 _ftperrors = None
    824 def ftperrors():
    825     """Return the set of errors raised by the FTP class."""
    826     global _ftperrors
    827     if _ftperrors is None:
    828         import ftplib
    829         _ftperrors = ftplib.all_errors
    830     return _ftperrors
    831 
    832 _noheaders = None
    833 def noheaders():
    834     """Return an empty mimetools.Message object."""
    835     global _noheaders
    836     if _noheaders is None:
    837         import mimetools
    838         try:
    839             from cStringIO import StringIO
    840         except ImportError:
    841             from StringIO import StringIO
    842         _noheaders = mimetools.Message(StringIO(), 0)
    843         _noheaders.fp.close()   # Recycle file descriptor
    844     return _noheaders
    845 
    846 
    847 # Utility classes
    848 
    849 class ftpwrapper:
    850     """Class used by open_ftp() for cache of open FTP connections."""
    851 
    852     def __init__(self, user, passwd, host, port, dirs,
    853                  timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    854         self.user = user
    855         self.passwd = passwd
    856         self.host = host
    857         self.port = port
    858         self.dirs = dirs
    859         self.timeout = timeout
    860         self.init()
    861 
    862     def init(self):
    863         import ftplib
    864         self.busy = 0
    865         self.ftp = ftplib.FTP()
    866         self.ftp.connect(self.host, self.port, self.timeout)
    867         self.ftp.login(self.user, self.passwd)
    868         for dir in self.dirs:
    869             self.ftp.cwd(dir)
    870 
    871     def retrfile(self, file, type):
    872         import ftplib
    873         self.endtransfer()
    874         if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
    875         else: cmd = 'TYPE ' + type; isdir = 0
    876         try:
    877             self.ftp.voidcmd(cmd)
    878         except ftplib.all_errors:
    879             self.init()
    880             self.ftp.voidcmd(cmd)
    881         conn = None
    882         if file and not isdir:
    883             # Try to retrieve as a file
    884             try:
    885                 cmd = 'RETR ' + file
    886                 conn = self.ftp.ntransfercmd(cmd)
    887             except ftplib.error_perm, reason:
    888                 if str(reason)[:3] != '550':
    889                     raise IOError, ('ftp error', reason), sys.exc_info()[2]
    890         if not conn:
    891             # Set transfer mode to ASCII!
    892             self.ftp.voidcmd('TYPE A')
    893             # Try a directory listing. Verify that directory exists.
    894             if file:
    895                 pwd = self.ftp.pwd()
    896                 try:
    897                     try:
    898                         self.ftp.cwd(file)
    899                     except ftplib.error_perm, reason:
    900                         raise IOError, ('ftp error', reason), sys.exc_info()[2]
    901                 finally:
    902                     self.ftp.cwd(pwd)
    903                 cmd = 'LIST ' + file
    904             else:
    905                 cmd = 'LIST'
    906             conn = self.ftp.ntransfercmd(cmd)
    907         self.busy = 1
    908         # Pass back both a suitably decorated object and a retrieval length
    909         return (addclosehook(conn[0].makefile('rb'),
    910                              self.endtransfer), conn[1])
    911     def endtransfer(self):
    912         if not self.busy:
    913             return
    914         self.busy = 0
    915         try:
    916             self.ftp.voidresp()
    917         except ftperrors():
    918             pass
    919 
    920     def close(self):
    921         self.endtransfer()
    922         try:
    923             self.ftp.close()
    924         except ftperrors():
    925             pass
    926 
    927 class addbase:
    928     """Base class for addinfo and addclosehook."""
    929 
    930     def __init__(self, fp):
    931         self.fp = fp
    932         self.read = self.fp.read
    933         self.readline = self.fp.readline
    934         if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
    935         if hasattr(self.fp, "fileno"):
    936             self.fileno = self.fp.fileno
    937         else:
    938             self.fileno = lambda: None
    939         if hasattr(self.fp, "__iter__"):
    940             self.__iter__ = self.fp.__iter__
    941             if hasattr(self.fp, "next"):
    942                 self.next = self.fp.next
    943 
    944     def __repr__(self):
    945         return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
    946                                              id(self), self.fp)
    947 
    948     def close(self):
    949         self.read = None
    950         self.readline = None
    951         self.readlines = None
    952         self.fileno = None
    953         if self.fp: self.fp.close()
    954         self.fp = None
    955 
    956 class addclosehook(addbase):
    957     """Class to add a close hook to an open file."""
    958 
    959     def __init__(self, fp, closehook, *hookargs):
    960         addbase.__init__(self, fp)
    961         self.closehook = closehook
    962         self.hookargs = hookargs
    963 
    964     def close(self):
    965         addbase.close(self)
    966         if self.closehook:
    967             self.closehook(*self.hookargs)
    968             self.closehook = None
    969             self.hookargs = None
    970 
    971 class addinfo(addbase):
    972     """Class to add an info() method to an open file."""
    973 
    974     def __init__(self, fp, headers):
    975         addbase.__init__(self, fp)
    976         self.headers = headers
    977 
    978     def info(self):
    979         return self.headers
    980 
    981 class addinfourl(addbase):
    982     """Class to add info() and geturl() methods to an open file."""
    983 
    984     def __init__(self, fp, headers, url, code=None):
    985         addbase.__init__(self, fp)
    986         self.headers = headers
    987         self.url = url
    988         self.code = code
    989 
    990     def info(self):
    991         return self.headers
    992 
    993     def getcode(self):
    994         return self.code
    995 
    996     def geturl(self):
    997         return self.url
    998 
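# A minimal sketch of the wrapper classes above: addinfourl is what the openers
# return, combining a file-like object with info(), geturl() and getcode().
# The URL is a placeholder and the function is never called at import time.
def _example_addinfourl():
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    f = addinfourl(StringIO('hello'), noheaders(), 'http://www.example.com/', 200)
    assert f.read() == 'hello'
    assert f.getcode() == 200
    assert f.geturl() == 'http://www.example.com/'
    f.close()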
    999 
   1000 # Utilities to parse URLs (most of these return None for missing parts):
   1001 # unwrap('<URL:type://host/path>') --> 'type://host/path'
   1002 # splittype('type:opaquestring') --> 'type', 'opaquestring'
   1003 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
   1004 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
   1005 # splitpasswd('user:passwd') --> 'user', 'passwd'
   1006 # splitport('host:port') --> 'host', 'port'
   1007 # splitquery('/path?query') --> '/path', 'query'
   1008 # splittag('/path#tag') --> '/path', 'tag'
   1009 # splitattr('/path;attr1=value1;attr2=value2;...') -->
   1010 #   '/path', ['attr1=value1', 'attr2=value2', ...]
   1011 # splitvalue('attr=value') --> 'attr', 'value'
   1012 # unquote('abc%20def') --> 'abc def'
   1013 # quote('abc def') --> 'abc%20def'
   1014 
   1015 try:
   1016     unicode
   1017 except NameError:
   1018     def _is_unicode(x):
   1019         return 0
   1020 else:
   1021     def _is_unicode(x):
   1022         return isinstance(x, unicode)
   1023 
   1024 def toBytes(url):
   1025     """toBytes(u"URL") --> 'URL'."""
   1026     # Most URL schemes require ASCII. If that changes, the conversion
   1027     # can be relaxed
   1028     if _is_unicode(url):
   1029         try:
   1030             url = url.encode("ASCII")
   1031         except UnicodeError:
   1032             raise UnicodeError("URL " + repr(url) +
   1033                                " contains non-ASCII characters")
   1034     return url
   1035 
   1036 def unwrap(url):
   1037     """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
   1038     url = url.strip()
   1039     if url[:1] == '<' and url[-1:] == '>':
   1040         url = url[1:-1].strip()
   1041     if url[:4] == 'URL:': url = url[4:].strip()
   1042     return url
   1043 
   1044 _typeprog = None
   1045 def splittype(url):
   1046     """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
   1047     global _typeprog
   1048     if _typeprog is None:
   1049         import re
   1050         _typeprog = re.compile('^([^/:]+):')
   1051 
   1052     match = _typeprog.match(url)
   1053     if match:
   1054         scheme = match.group(1)
   1055         return scheme.lower(), url[len(scheme) + 1:]
   1056     return None, url
   1057 
   1058 _hostprog = None
   1059 def splithost(url):
   1060     """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
   1061     global _hostprog
   1062     if _hostprog is None:
   1063         import re
   1064         _hostprog = re.compile('^//([^/?]*)(.*)$')
   1065 
   1066     match = _hostprog.match(url)
   1067     if match:
   1068         host_port = match.group(1)
   1069         path = match.group(2)
   1070         if path and not path.startswith('/'):
   1071             path = '/' + path
   1072         return host_port, path
   1073     return None, url
   1074 
   1075 _userprog = None
   1076 def splituser(host):
   1077     """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
   1078     global _userprog
   1079     if _userprog is None:
   1080         import re
   1081         _userprog = re.compile('^(.*)@(.*)$')
   1082 
   1083     match = _userprog.match(host)
   1084     if match: return match.group(1, 2)
   1085     return None, host
   1086 
   1087 _passwdprog = None
   1088 def splitpasswd(user):
   1089     """splitpasswd('user:passwd') -> 'user', 'passwd'."""
   1090     global _passwdprog
   1091     if _passwdprog is None:
   1092         import re
   1093         _passwdprog = re.compile('^([^:]*):(.*)$',re.S)
   1094 
   1095     match = _passwdprog.match(user)
   1096     if match: return match.group(1, 2)
   1097     return user, None
   1098 
   1099 # splittag('/path#tag') --> '/path', 'tag'
   1100 _portprog = None
   1101 def splitport(host):
   1102     """splitport('host:port') --> 'host', 'port'."""
   1103     global _portprog
   1104     if _portprog is None:
   1105         import re
   1106         _portprog = re.compile('^(.*):([0-9]+)$')
   1107 
   1108     match = _portprog.match(host)
   1109     if match: return match.group(1, 2)
   1110     return host, None
   1111 
   1112 _nportprog = None
   1113 def splitnport(host, defport=-1):
   1114     """Split host and port, returning numeric port.
   1115     Return the given default port if no ':' is found; defaults to -1.
   1116     Return the numerical port if a valid number is found after ':'.
   1117     Return None if ':' is present but not followed by a valid number."""
   1118     global _nportprog
   1119     if _nportprog is None:
   1120         import re
   1121         _nportprog = re.compile('^(.*):(.*)$')
   1122 
   1123     match = _nportprog.match(host)
   1124     if match:
   1125         host, port = match.group(1, 2)
   1126         try:
   1127             if not port: raise ValueError, "no digits"
   1128             nport = int(port)
   1129         except ValueError:
   1130             nport = None
   1131         return host, nport
   1132     return host, defport
   1133 
   1134 _queryprog = None
   1135 def splitquery(url):
   1136     """splitquery('/path?query') --> '/path', 'query'."""
   1137     global _queryprog
   1138     if _queryprog is None:
   1139         import re
   1140         _queryprog = re.compile('^(.*)\?([^?]*)$')
   1141 
   1142     match = _queryprog.match(url)
   1143     if match: return match.group(1, 2)
   1144     return url, None
   1145 
   1146 _tagprog = None
   1147 def splittag(url):
   1148     """splittag('/path#tag') --> '/path', 'tag'."""
   1149     global _tagprog
   1150     if _tagprog is None:
   1151         import re
   1152         _tagprog = re.compile('^(.*)#([^#]*)$')
   1153 
   1154     match = _tagprog.match(url)
   1155     if match: return match.group(1, 2)
   1156     return url, None
   1157 
   1158 def splitattr(url):
   1159     """splitattr('/path;attr1=value1;attr2=value2;...') ->
   1160         '/path', ['attr1=value1', 'attr2=value2', ...]."""
   1161     words = url.split(';')
   1162     return words[0], words[1:]
   1163 
   1164 _valueprog = None
   1165 def splitvalue(attr):
   1166     """splitvalue('attr=value') --> 'attr', 'value'."""
   1167     global _valueprog
   1168     if _valueprog is None:
   1169         import re
   1170         _valueprog = re.compile('^([^=]*)=(.*)$')
   1171 
   1172     match = _valueprog.match(attr)
   1173     if match: return match.group(1, 2)
   1174     return attr, None
   1175 
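# A short sketch tying the split* helpers together on one made-up URL; the
# comments show the values each step should produce, and the function is never
# called at import time.
def _example_split_helpers():
    url = 'http://user:pw@www.example.com:8080/doc;type=a?q=1#frag'
    scheme, rest = splittype(url)          # 'http', '//user:pw@www.example.com:8080/doc;type=a?q=1#frag'
    netloc, path = splithost(rest)         # 'user:pw@www.example.com:8080', '/doc;type=a?q=1#frag'
    userinfo, hostport = splituser(netloc) # 'user:pw', 'www.example.com:8080'
    user, passwd = splitpasswd(userinfo)   # 'user', 'pw'
    host, port = splitport(hostport)       # 'www.example.com', '8080'
    path, tag = splittag(path)             # '/doc;type=a?q=1', 'frag'
    path, query = splitquery(path)         # '/doc;type=a', 'q=1'
    path, attrs = splitattr(path)          # '/doc', ['type=a']
    return scheme, user, passwd, host, port, path, attrs, query, tag
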
   1176 # urlparse contains a duplicate of this method to avoid a circular import.  If
   1177 # you update this method, also update the copy in urlparse.  This code
   1178 # duplication does not exist in Python3.
   1179 
   1180 _hexdig = '0123456789ABCDEFabcdef'
   1181 _hextochr = dict((a + b, chr(int(a + b, 16)))
   1182                  for a in _hexdig for b in _hexdig)
   1183 
   1184 def unquote(s):
   1185     """unquote('abc%20def') -> 'abc def'."""
   1186     res = s.split('%')
   1187     # fastpath
   1188     if len(res) == 1:
   1189         return s
   1190     s = res[0]
   1191     for item in res[1:]:
   1192         try:
   1193             s += _hextochr[item[:2]] + item[2:]
   1194         except KeyError:
   1195             s += '%' + item
   1196         except UnicodeDecodeError:
   1197             s += unichr(int(item[:2], 16)) + item[2:]
   1198     return s
   1199 
   1200 def unquote_plus(s):
   1201     """unquote_plus('%7e/abc+def') -> '~/abc def'"""
   1202     s = s.replace('+', ' ')
   1203     return unquote(s)
   1204 
   1205 always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
   1206                'abcdefghijklmnopqrstuvwxyz'
   1207                '0123456789' '_.-')
   1208 _safe_map = {}
   1209 for i, c in zip(xrange(256), str(bytearray(xrange(256)))):
   1210     _safe_map[c] = c if (i < 128 and c in always_safe) else '%{:02X}'.format(i)
   1211 _safe_quoters = {}
   1212 
   1213 def quote(s, safe='/'):
   1214     """quote('abc def') -> 'abc%20def'
   1215 
   1216     Each part of a URL, e.g. the path info, the query, etc., has a
   1217     different set of reserved characters that must be quoted.
   1218 
   1219     RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
   1220     the following reserved characters.
   1221 
   1222     reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
   1223                   "$" | ","
   1224 
   1225     Each of these characters is reserved in some component of a URL,
   1226     but not necessarily in all of them.
   1227 
   1228     By default, the quote function is intended for quoting the path
   1229     section of a URL.  Thus, it will not encode '/'.  This character
   1230     is reserved, but in typical usage the quote function is being
   1231     called on a path where the existing slash characters are used as
   1232     reserved characters.
   1233     """
   1234     # fastpath
   1235     if not s:
   1236         if s is None:
   1237             raise TypeError('None object cannot be quoted')
   1238         return s
   1239     cachekey = (safe, always_safe)
   1240     try:
   1241         (quoter, safe) = _safe_quoters[cachekey]
   1242     except KeyError:
   1243         safe_map = _safe_map.copy()
   1244         safe_map.update([(c, c) for c in safe])
   1245         quoter = safe_map.__getitem__
   1246         safe = always_safe + safe
   1247         _safe_quoters[cachekey] = (quoter, safe)
   1248     if not s.rstrip(safe):
   1249         return s
   1250     return ''.join(map(quoter, s))
   1251 
   1252 def quote_plus(s, safe=''):
   1253     """Quote the query fragment of a URL, replacing ' ' with '+'."""
   1254     if ' ' in s:
   1255         s = quote(s, safe + ' ')
   1256         return s.replace(' ', '+')
   1257     return quote(s, safe)
   1258 
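# A few concrete cases for quote(), quote_plus() and unquote_plus(); each
# assertion reflects the default safe characters described in the docstrings
# above.  The function is never called at import time.
def _example_quoting():
    assert quote('/path with spaces/') == '/path%20with%20spaces/'   # '/' is safe by default
    assert quote('/path with spaces/', safe='') == '%2Fpath%20with%20spaces%2F'
    assert quote_plus('a b&c=d') == 'a+b%26c%3Dd'                    # spaces become '+'
    assert unquote_plus('a+b%26c%3Dd') == 'a b&c=d'
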
   1259 def urlencode(query, doseq=0):
   1260     """Encode a sequence of two-element tuples or a dictionary into a URL query string.
   1261 
   1262     If any values in the query arg are sequences and doseq is true, each
   1263     sequence element is converted to a separate parameter.
   1264 
   1265     If the query arg is a sequence of two-element tuples, the order of the
   1266     parameters in the output will match the order of parameters in the
   1267     input.
   1268     """
   1269 
   1270     if hasattr(query,"items"):
   1271         # mapping objects
   1272         query = query.items()
   1273     else:
   1274         # it's a bother at times that strings and string-like objects are
   1275         # sequences...
   1276         try:
   1277             # non-sequence items should not work with len()
   1278             # non-empty strings will fail this
   1279             if len(query) and not isinstance(query[0], tuple):
   1280                 raise TypeError
   1281             # zero-length sequences of all types will get here and succeed,
   1282             # but that's a minor nit - since the original implementation
   1283             # allowed empty dicts that type of behavior probably should be
   1284             # preserved for consistency
   1285         except TypeError:
   1286             ty,va,tb = sys.exc_info()
   1287             raise TypeError, "not a valid non-string sequence or mapping object", tb
   1288 
   1289     l = []
   1290     if not doseq:
   1291         # preserve old behavior
   1292         for k, v in query:
   1293             k = quote_plus(str(k))
   1294             v = quote_plus(str(v))
   1295             l.append(k + '=' + v)
   1296     else:
   1297         for k, v in query:
   1298             k = quote_plus(str(k))
   1299             if isinstance(v, str):
   1300                 v = quote_plus(v)
   1301                 l.append(k + '=' + v)
   1302             elif _is_unicode(v):
   1303                 # is there a reasonable way to convert to ASCII?
   1304                 # encode generates a string, but "replace" or "ignore"
   1305                 # lose information and "strict" can raise UnicodeError
   1306                 v = quote_plus(v.encode("ASCII","replace"))
   1307                 l.append(k + '=' + v)
   1308             else:
   1309                 try:
   1310                     # is this a sufficient test for sequence-ness?
   1311                     len(v)
   1312                 except TypeError:
   1313                     # not a sequence
   1314                     v = quote_plus(str(v))
   1315                     l.append(k + '=' + v)
   1316                 else:
   1317                     # loop over the sequence
   1318                     for elt in v:
   1319                         l.append(k + '=' + quote_plus(str(elt)))
   1320     return '&'.join(l)
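# Illustrative sketch of urlencode() (not part of the original module):
#     >>> urlencode([('q', 'python urllib'), ('page', 2)])
#     'q=python+urllib&page=2'
#     >>> urlencode([('tag', ['a', 'b'])], doseq=1)
#     'tag=a&tag=b'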
   1321 
   1322 # Proxy handling
   1323 def getproxies_environment():
   1324     """Return a dictionary of scheme -> proxy server URL mappings.
   1325 
   1326     Scan the environment for variables named <scheme>_proxy;
   1327     this seems to be the standard convention.  If you need a
   1328     different way, you can pass a proxies dictionary to the
   1329     [Fancy]URLopener constructor.
   1330 
   1331     """
   1332     proxies = {}
   1333     for name, value in os.environ.items():
   1334         name = name.lower()
   1335         if value and name[-6:] == '_proxy':
   1336             proxies[name[:-6]] = value
   1337     return proxies
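# Illustrative sketch (not part of the original module): with a hypothetical
# environment containing http_proxy=http://proxy.example.com:3128 and
# ftp_proxy=http://proxy.example.com:3128, this returns
# {'http': 'http://proxy.example.com:3128', 'ftp': 'http://proxy.example.com:3128'}.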
   1338 
   1339 def proxy_bypass_environment(host):
   1340     """Test if proxies should not be used for a particular host.
   1341 
   1342     Checks the environment for a variable named no_proxy, which should
   1343     be a list of DNS suffixes separated by commas, or '*' for all hosts.
   1344     """
   1345     no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
   1346     # '*' is special case for always bypass
   1347     if no_proxy == '*':
   1348         return 1
   1349     # strip port off host
   1350     hostonly, port = splitport(host)
   1351     # check if the host ends with any of the DNS suffixes
   1352     for name in no_proxy.split(','):
   1353         if name and (hostonly.endswith(name) or host.endswith(name)):
   1354             return 1
   1355     # otherwise, don't bypass

   1356     return 0
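# Illustrative sketch (not part of the original module): with a hypothetical
# no_proxy=localhost,.internal.example.com, proxy_bypass_environment() returns
# 1 for 'db.internal.example.com' and for 'localhost:8000' (the port is
# stripped first), and 0 for 'www.python.org'.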
   1357 
   1358 
   1359 if sys.platform == 'darwin':
   1360     from _scproxy import _get_proxy_settings, _get_proxies
   1361 
   1362     def proxy_bypass_macosx_sysconf(host):
   1363         """
   1364         Return True iff this host shouldn't be accessed using a proxy.
   1365 
   1366         This function uses the MacOSX framework SystemConfiguration
   1367         to fetch the proxy information.
   1368         """
   1369         import re
   1370         import socket
   1371         from fnmatch import fnmatch
   1372 
   1373         hostonly, port = splitport(host)
   1374 
   1375         def ip2num(ipAddr):
   1376             parts = ipAddr.split('.')
   1377             parts = map(int, parts)
   1378             if len(parts) != 4:
   1379                 parts = (parts + [0, 0, 0, 0])[:4]
   1380             return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
   1381 
   1382         proxy_settings = _get_proxy_settings()
   1383 
   1384         # Check for simple host names:
   1385         if '.' not in host:
   1386             if proxy_settings['exclude_simple']:
   1387                 return True
   1388 
   1389         hostIP = None
   1390 
   1391         for value in proxy_settings.get('exceptions', ()):
   1392             # Items in the list are strings like these: *.local, 169.254/16
   1393             if not value: continue
   1394 
   1395             m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
   1396             if m is not None:
   1397                 if hostIP is None:
   1398                     try:
   1399                         hostIP = socket.gethostbyname(hostonly)
   1400                         hostIP = ip2num(hostIP)
   1401                     except socket.error:
   1402                         continue
   1403 
   1404                 base = ip2num(m.group(1))
   1405                 mask = m.group(2)
   1406                 if mask is None:
   1407                     mask = 8 * (m.group(1).count('.') + 1)
   1408 
   1409                 else:
   1410                     mask = int(mask[1:])
   1411                 mask = 32 - mask
   1412 
   1413                 if (hostIP >> mask) == (base >> mask):
   1414                     return True
   1415 
   1416             elif fnmatch(host, value):
   1417                 return True
   1418 
   1419         return False
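    # Illustrative sketch (not part of the original module): an 'exceptions'
    # entry such as '169.254/16' bypasses any host whose address falls in
    # 169.254.0.0/16, because ip2num() pads '169.254' to 169.254.0.0 and the
    # comparison above discards the low 16 bits; a non-numeric entry such as
    # '*.local' is matched with fnmatch() against the host name instead.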
   1420 
   1421     def getproxies_macosx_sysconf():
   1422         """Return a dictionary of scheme -> proxy server URL mappings.
   1423 
   1424         This function uses the MacOSX framework SystemConfiguration
   1425         to fetch the proxy information.
   1426         """
   1427         return _get_proxies()
   1428 
   1429     def proxy_bypass(host):
   1430         if getproxies_environment():
   1431             return proxy_bypass_environment(host)
   1432         else:
   1433             return proxy_bypass_macosx_sysconf(host)
   1434 
   1435     def getproxies():
   1436         return getproxies_environment() or getproxies_macosx_sysconf()
   1437 
   1438 elif os.name == 'nt':
   1439     def getproxies_registry():
   1440         """Return a dictionary of scheme -> proxy server URL mappings.
   1441 
   1442         Win32 uses the registry to store proxies.
   1443 
   1444         """
   1445         proxies = {}
   1446         try:
   1447             import _winreg
   1448         except ImportError:
   1449             # Std module, so should be around - but you never know!
   1450             return proxies
   1451         try:
   1452             internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
   1453                 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
   1454             proxyEnable = _winreg.QueryValueEx(internetSettings,
   1455                                                'ProxyEnable')[0]
   1456             if proxyEnable:
   1457                 # Returned as Unicode but problems if not converted to ASCII
   1458                 proxyServer = str(_winreg.QueryValueEx(internetSettings,
   1459                                                        'ProxyServer')[0])
   1460                 if '=' in proxyServer:
   1461                     # Per-protocol settings
   1462                     for p in proxyServer.split(';'):
   1463                         protocol, address = p.split('=', 1)
   1464                         # See if address has a type:// prefix
   1465                         import re
   1466                         if not re.match('^([^/:]+)://', address):
   1467                             address = '%s://%s' % (protocol, address)
   1468                         proxies[protocol] = address
   1469                 else:
   1470                     # Use one setting for all protocols
   1471                     if proxyServer[:5] == 'http:':
   1472                         proxies['http'] = proxyServer
   1473                     else:
   1474                         proxies['http'] = 'http://%s' % proxyServer
   1475                         proxies['https'] = 'https://%s' % proxyServer
   1476                         proxies['ftp'] = 'ftp://%s' % proxyServer
   1477             internetSettings.Close()
   1478         except (WindowsError, ValueError, TypeError):
   1479             # Either the registry key was not found, or the value was in an
   1480             # unexpected format.
   1481             # proxies is already set up to be empty, so there is nothing to do
   1482             pass
   1483         return proxies
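    # Illustrative sketch (not part of the original module), using hypothetical
    # registry values: a ProxyServer of 'http=proxy1:80;ftp=proxy2:21' yields
    # {'http': 'http://proxy1:80', 'ftp': 'ftp://proxy2:21'}, while a bare
    # 'proxy:8080' yields http, https and ftp entries that all point at
    # proxy:8080 with the matching scheme prefix.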
   1484 
   1485     def getproxies():
   1486         """Return a dictionary of scheme -> proxy server URL mappings.
   1487 
   1488         Returns settings gathered from the environment, if specified,
   1489         or the registry.
   1490 
   1491         """
   1492         return getproxies_environment() or getproxies_registry()
   1493 
   1494     def proxy_bypass_registry(host):
   1495         try:
   1496             import _winreg
   1497             import re
   1498         except ImportError:
   1499             # Std modules, so should be around - but you never know!
   1500             return 0
   1501         try:
   1502             internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
   1503                 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
   1504             proxyEnable = _winreg.QueryValueEx(internetSettings,
   1505                                                'ProxyEnable')[0]
   1506             proxyOverride = str(_winreg.QueryValueEx(internetSettings,
   1507                                                      'ProxyOverride')[0])
   1508             # ^^^^ Returned as Unicode but problems if not converted to ASCII
   1509         except WindowsError:
   1510             return 0
   1511         if not proxyEnable or not proxyOverride:
   1512             return 0
   1513         # try to make a host list from name and IP address.
   1514         rawHost, port = splitport(host)
   1515         host = [rawHost]
   1516         try:
   1517             addr = socket.gethostbyname(rawHost)
   1518             if addr != rawHost:
   1519                 host.append(addr)
   1520         except socket.error:
   1521             pass
   1522         try:
   1523             fqdn = socket.getfqdn(rawHost)
   1524             if fqdn != rawHost:
   1525                 host.append(fqdn)
   1526         except socket.error:
   1527             pass
   1528         # make a check value list from the registry entry: replace the
   1529         # '<local>' string by the localhost entry and the corresponding
   1530         # canonical entry.
   1531         proxyOverride = proxyOverride.split(';')
   1532         # now check if we match one of the registry values.
   1533         for test in proxyOverride:
   1534             if test == '<local>':
   1535                 if '.' not in rawHost:
   1536                     return 1
   1537             test = test.replace(".", r"\.")     # mask dots
   1538             test = test.replace("*", r".*")     # change glob sequence
   1539             test = test.replace("?", r".")      # change glob char
   1540             for val in host:
   1541                 # print "%s <--> %s" %( test, val )
   1542                 if re.match(test, val, re.I):
   1543                     return 1
   1544         return 0
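    # Illustrative sketch (not part of the original module), with hypothetical
    # ProxyOverride entries: '<local>' bypasses any host name without a dot,
    # and a glob such as '*.example.com' becomes the regex '.*\.example\.com',
    # which is matched case-insensitively against the host name, its resolved
    # address and its FQDN.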
   1545 
   1546     def proxy_bypass(host):
   1547         """Decide whether the given host should bypass the proxy.
   1548
   1549         Uses the environment's no_proxy setting when any <scheme>_proxy
   1550         variables are set, and the registry ProxyOverride value otherwise.
   1551 
   1552         """
   1553         if getproxies_environment():
   1554             return proxy_bypass_environment(host)
   1555         else:
   1556             return proxy_bypass_registry(host)
   1557 
   1558 else:
   1559     # By default use environment variables
   1560     getproxies = getproxies_environment
   1561     proxy_bypass = proxy_bypass_environment
   1562 
   1563 # Test and time quote() and unquote()
   1564 def test1():
   1565     s = ''
   1566     for i in range(256): s = s + chr(i)
   1567     s = s*4
   1568     t0 = time.time()
   1569     qs = quote(s)
   1570     uqs = unquote(qs)
   1571     t1 = time.time()
   1572     if uqs != s:
   1573         print 'Wrong!'
   1574     print repr(s)
   1575     print repr(qs)
   1576     print repr(uqs)
   1577     print round(t1 - t0, 3), 'sec'
   1578 
   1579 
   1580 def reporthook(blocknum, blocksize, totalsize):
   1581     # Report during remote transfers
   1582     print "Block number: %d, Block size: %d, Total size: %d" % (
   1583         blocknum, blocksize, totalsize)
   1584 
   1585 # Test program
   1586 def test(args=[]):
   1587     if not args:
   1588         args = [
   1589             '/etc/passwd',
   1590             'file:/etc/passwd',
   1591             'file://localhost/etc/passwd',
   1592             'ftp://ftp.gnu.org/pub/README',
   1593             'http://www.python.org/index.html',
   1594             ]
   1595         if hasattr(URLopener, "open_https"):
   1596             args.append('https://synergy.as.cmu.edu/~geek/')
   1597     try:
   1598         for url in args:
   1599             print '-'*10, url, '-'*10
   1600             fn, h = urlretrieve(url, None, reporthook)
   1601             print fn
   1602             if h:
   1603                 print '======'
   1604                 for k in h.keys(): print k + ':', h[k]
   1605                 print '======'
   1606             with open(fn, 'rb') as fp:
   1607                 data = fp.read()
   1608             if '\r' in data:
   1609                 table = string.maketrans("", "")
   1610                 data = data.translate(table, "\r")
   1611             print data
   1612             fn, h = None, None
   1613         print '-'*40
   1614     finally:
   1615         urlcleanup()
   1616 
   1617 def main():
   1618     import getopt, sys
   1619     try:
   1620         opts, args = getopt.getopt(sys.argv[1:], "th")
   1621     except getopt.error, msg:
   1622         print msg
   1623         print "Use -h for help"
   1624         return
   1625     t = 0
   1626     for o, a in opts:
   1627         if o == '-t':
   1628             t = t + 1
   1629         if o == '-h':
   1630             print "Usage: python urllib.py [-t] [url ...]"
   1631             print "-t runs self-test;",
   1632             print "otherwise, contents of urls are printed"
   1633             return
   1634     if t:
   1635         if t > 1:
   1636             test1()
   1637         test(args)
   1638     else:
   1639         if not args:
   1640             print "Use -h for help"
   1641         for url in args:
   1642             print urlopen(url).read(),
   1643 
   1644 # Run test program when run as a script
   1645 if __name__ == '__main__':
   1646     main()
   1647