      1 """Open an arbitrary URL.
      2 
      3 See the following document for more info on URLs:
      4 "Names and Addresses, URIs, URLs, URNs, URCs", at
      5 http://www.w3.org/pub/WWW/Addressing/Overview.html
      6 
      7 See also the HTTP spec (from which the error codes are derived):
      8 "HTTP - Hypertext Transfer Protocol", at
      9 http://www.w3.org/pub/WWW/Protocols/
     10 
     11 Related standards and specs:
- RFC1808: the "relative URL" spec. (authoritative status)
- RFC1738: the "URL standard". (authoritative status)
- RFC1630: the "URI spec". (informational status)
     15 
     16 The object returned by URLopener().open(file) will differ per
protocol.  All you know is that it has methods read(), readline(),
     18 readlines(), fileno(), close() and info().  The read*(), fileno()
     19 and close() methods work like those of open files.
     20 The info() method returns a mimetools.Message object which can be
     21 used to query various info about the object, if available.
     22 (mimetools.Message objects are queried with the getheader() method.)
     23 """
     24 
     25 import string
     26 import socket
     27 import os
     28 import time
     29 import sys
     30 import base64
     31 import re
     32 
     33 from urlparse import urljoin as basejoin
     34 
     35 __all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
     36            "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
     37            "urlencode", "url2pathname", "pathname2url", "splittag",
     38            "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
     39            "splittype", "splithost", "splituser", "splitpasswd", "splitport",
     40            "splitnport", "splitquery", "splitattr", "splitvalue",
     41            "getproxies"]
     42 
     43 __version__ = '1.17'    # XXX This version is not always updated :-(
     44 
     45 MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
     46 
     47 # Helper for non-unix systems
     48 if os.name == 'nt':
     49     from nturl2path import url2pathname, pathname2url
     50 elif os.name == 'riscos':
     51     from rourl2path import url2pathname, pathname2url
     52 else:
     53     def url2pathname(pathname):
     54         """OS-specific conversion from a relative URL of the 'file' scheme
     55         to a file system path; not recommended for general use."""
     56         return unquote(pathname)
     57 
     58     def pathname2url(pathname):
     59         """OS-specific conversion from a file system path to a relative URL
     60         of the 'file' scheme; not recommended for general use."""
     61         return quote(pathname)
     62 
     63 # This really consists of two pieces:
     64 # (1) a class which handles opening of all sorts of URLs
     65 #     (plus assorted utilities etc.)
     66 # (2) a set of functions for parsing URLs
     67 # XXX Should these be separated out into different modules?
     68 
     69 
     70 # Shortcut for basic usage
     71 _urlopener = None
     72 def urlopen(url, data=None, proxies=None, context=None):
     73     """Create a file-like object for the specified URL to read from."""
     74     from warnings import warnpy3k
     75     warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
     76              "favor of urllib2.urlopen()", stacklevel=2)
     77 
     78     global _urlopener
     79     if proxies is not None or context is not None:
     80         opener = FancyURLopener(proxies=proxies, context=context)
     81     elif not _urlopener:
     82         opener = FancyURLopener()
     83         _urlopener = opener
     84     else:
     85         opener = _urlopener
     86     if data is None:
     87         return opener.open(url)
     88     else:
     89         return opener.open(url, data)
     90 def urlretrieve(url, filename=None, reporthook=None, data=None, context=None):
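    """Retrieve the given URL into a local file and return (filename, headers).

    If no filename is given, the data normally goes into a temporary file,
    which urlcleanup() will remove."""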
     91     global _urlopener
     92     if context is not None:
     93         opener = FancyURLopener(context=context)
     94     elif not _urlopener:
     95         _urlopener = opener = FancyURLopener()
     96     else:
     97         opener = _urlopener
     98     return opener.retrieve(url, filename, reporthook, data)
     99 def urlcleanup():
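    """Remove temporary files and cached state left over from urlretrieve()."""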
    100     if _urlopener:
    101         _urlopener.cleanup()
    102     _safe_quoters.clear()
    103     ftpcache.clear()
    104 
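# Sketch of urlretrieve() with a progress hook (URL and callback name are
# illustrative; the hook receives the block number, block size and total size):
#
#     def report(blocknum, blocksize, totalsize):
#         if totalsize > 0:
#             percent = min(100, blocknum * blocksize * 100 // totalsize)
#             print "%d%% downloaded" % percent
#
#     filename, headers = urlretrieve("http://example.com/file.bin",
#                                     reporthook=report)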
    105 # check for SSL
    106 try:
    107     import ssl
except ImportError:
    109     _have_ssl = False
    110 else:
    111     _have_ssl = True
    112 
    113 # exception raised when downloaded size does not match content-length
    114 class ContentTooShortError(IOError):
    115     def __init__(self, message, content):
    116         IOError.__init__(self, message)
    117         self.content = content
    118 
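# Sketch of how a caller might handle a short download (illustrative only):
#
#     try:
#         filename, headers = urlretrieve("http://example.com/big.bin")
#     except ContentTooShortError, e:
#         partial = e.content    # the (filename, headers) result gathered so far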
    119 ftpcache = {}
    120 class URLopener:
    121     """Class to open URLs.
    122     This is a class rather than just a subroutine because we may need
    123     more than one set of global protocol-specific options.
    124     Note -- this is a base class for those who don't want the
    automatic handling of error types 302 (relocated) and 401
    126     (authorization needed)."""
    127 
    128     __tempfiles = None
    129 
    130     version = "Python-urllib/%s" % __version__
    131 
    132     # Constructor
    133     def __init__(self, proxies=None, context=None, **x509):
    134         if proxies is None:
    135             proxies = getproxies()
    136         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
    137         self.proxies = proxies
    138         self.key_file = x509.get('key_file')
    139         self.cert_file = x509.get('cert_file')
    140         self.context = context
    141         self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')]
    142         self.__tempfiles = []
    143         self.__unlink = os.unlink # See cleanup()
    144         self.tempcache = None
    145         # Undocumented feature: if you assign {} to tempcache,
    146         # it is used to cache files retrieved with
    147         # self.retrieve().  This is not enabled by default
    148         # since it does not work for changing documents (and I
    149         # haven't got the logic to check expiration headers
    150         # yet).
    151         self.ftpcache = ftpcache
    152         # Undocumented feature: you can use a different
    153         # ftp cache by assigning to the .ftpcache member;
    154         # in case you want logically independent URL openers
    155         # XXX This is not threadsafe.  Bah.
    156 
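    # Sketch of constructing an opener with an explicit proxy mapping (the
    # proxy host below is hypothetical):
    #
    #     opener = URLopener(proxies={'http': 'http://proxy.example.com:3128/'})
    #     f = opener.open('http://example.com/')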
    157     def __del__(self):
    158         self.close()
    159 
    160     def close(self):
    161         self.cleanup()
    162 
    163     def cleanup(self):
    164         # This code sometimes runs when the rest of this module
    165         # has already been deleted, so it can't use any globals
    166         # or import anything.
    167         if self.__tempfiles:
    168             for file in self.__tempfiles:
    169                 try:
    170                     self.__unlink(file)
    171                 except OSError:
    172                     pass
    173             del self.__tempfiles[:]
    174         if self.tempcache:
    175             self.tempcache.clear()
    176 
    177     def addheader(self, *args):
    178         """Add a header to be used by the HTTP interface only
    179         e.g. u.addheader('Accept', 'sound/basic')"""
    180         self.addheaders.append(args)
    181 
    182     # External interface
    183     def open(self, fullurl, data=None):
    184         """Use URLopener().open(file) instead of open(file, 'r')."""
    185         fullurl = unwrap(toBytes(fullurl))
        # Percent-encode the URL, working around lame servers that choke on
        # e.g. spaces within URL paths.
    188         fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
    189         if self.tempcache and fullurl in self.tempcache:
    190             filename, headers = self.tempcache[fullurl]
    191             fp = open(filename, 'rb')
    192             return addinfourl(fp, headers, fullurl)
    193         urltype, url = splittype(fullurl)
    194         if not urltype:
    195             urltype = 'file'
    196         if urltype in self.proxies:
    197             proxy = self.proxies[urltype]
    198             urltype, proxyhost = splittype(proxy)
    199             host, selector = splithost(proxyhost)
    200             url = (host, fullurl) # Signal special case to open_*()
    201         else:
    202             proxy = None
    203         name = 'open_' + urltype
    204         self.type = urltype
    205         name = name.replace('-', '_')
    206         if not hasattr(self, name):
    207             if proxy:
    208                 return self.open_unknown_proxy(proxy, fullurl, data)
    209             else:
    210                 return self.open_unknown(fullurl, data)
    211         try:
    212             if data is None:
    213                 return getattr(self, name)(url)
    214             else:
    215                 return getattr(self, name)(url, data)
    216         except socket.error, msg:
    217             raise IOError, ('socket error', msg), sys.exc_info()[2]
    218 
    219     def open_unknown(self, fullurl, data=None):
    220         """Overridable interface to open unknown URL type."""
    221         type, url = splittype(fullurl)
    222         raise IOError, ('url error', 'unknown url type', type)
    223 
    224     def open_unknown_proxy(self, proxy, fullurl, data=None):
    225         """Overridable interface to open unknown URL type."""
    226         type, url = splittype(fullurl)
    227         raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
    228 
    229     # External interface
    230     def retrieve(self, url, filename=None, reporthook=None, data=None):
    231         """retrieve(url) returns (filename, headers) for a local object
    232         or (tempfilename, headers) for a remote object."""
    233         url = unwrap(toBytes(url))
    234         if self.tempcache and url in self.tempcache:
    235             return self.tempcache[url]
    236         type, url1 = splittype(url)
    237         if filename is None and (not type or type == 'file'):
    238             try:
    239                 fp = self.open_local_file(url1)
    240                 hdrs = fp.info()
    241                 fp.close()
    242                 return url2pathname(splithost(url1)[1]), hdrs
    243             except IOError:
    244                 pass
    245         fp = self.open(url, data)
    246         try:
    247             headers = fp.info()
    248             if filename:
    249                 tfp = open(filename, 'wb')
    250             else:
    251                 import tempfile
    252                 garbage, path = splittype(url)
    253                 garbage, path = splithost(path or "")
    254                 path, garbage = splitquery(path or "")
    255                 path, garbage = splitattr(path or "")
    256                 suffix = os.path.splitext(path)[1]
    257                 (fd, filename) = tempfile.mkstemp(suffix)
    258                 self.__tempfiles.append(filename)
    259                 tfp = os.fdopen(fd, 'wb')
    260             try:
    261                 result = filename, headers
    262                 if self.tempcache is not None:
    263                     self.tempcache[url] = result
    264                 bs = 1024*8
    265                 size = -1
    266                 read = 0
    267                 blocknum = 0
    268                 if "content-length" in headers:
    269                     size = int(headers["Content-Length"])
    270                 if reporthook:
    271                     reporthook(blocknum, bs, size)
    272                 while 1:
    273                     block = fp.read(bs)
    274                     if block == "":
    275                         break
    276                     read += len(block)
    277                     tfp.write(block)
    278                     blocknum += 1
    279                     if reporthook:
    280                         reporthook(blocknum, bs, size)
    281             finally:
    282                 tfp.close()
    283         finally:
    284             fp.close()
    285 
    286         # raise exception if actual size does not match content-length header
    287         if size >= 0 and read < size:
    288             raise ContentTooShortError("retrieval incomplete: got only %i out "
    289                                        "of %i bytes" % (read, size), result)
    290 
    291         return result
    292 
    293     # Each method named open_<type> knows how to open that type of URL
    294 
    295     def open_http(self, url, data=None):
    296         """Use HTTP protocol."""
    297         import httplib
    298         user_passwd = None
    299         proxy_passwd= None
    300         if isinstance(url, str):
    301             host, selector = splithost(url)
    302             if host:
    303                 user_passwd, host = splituser(host)
    304                 host = unquote(host)
    305             realhost = host
    306         else:
    307             host, selector = url
    308             # check whether the proxy contains authorization information
    309             proxy_passwd, host = splituser(host)
    310             # now we proceed with the url we want to obtain
    311             urltype, rest = splittype(selector)
    312             url = rest
    313             user_passwd = None
    314             if urltype.lower() != 'http':
    315                 realhost = None
    316             else:
    317                 realhost, rest = splithost(rest)
    318                 if realhost:
    319                     user_passwd, realhost = splituser(realhost)
    320                 if user_passwd:
    321                     selector = "%s://%s%s" % (urltype, realhost, rest)
    322                 if proxy_bypass(realhost):
    323                     host = realhost
    324 
    325             #print "proxy via http:", host, selector
    326         if not host: raise IOError, ('http error', 'no host given')
    327 
    328         if proxy_passwd:
    329             proxy_passwd = unquote(proxy_passwd)
    330             proxy_auth = base64.b64encode(proxy_passwd).strip()
    331         else:
    332             proxy_auth = None
    333 
    334         if user_passwd:
    335             user_passwd = unquote(user_passwd)
    336             auth = base64.b64encode(user_passwd).strip()
    337         else:
    338             auth = None
    339         h = httplib.HTTP(host)
    340         if data is not None:
    341             h.putrequest('POST', selector)
    342             h.putheader('Content-Type', 'application/x-www-form-urlencoded')
    343             h.putheader('Content-Length', '%d' % len(data))
    344         else:
    345             h.putrequest('GET', selector)
    346         if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
    347         if auth: h.putheader('Authorization', 'Basic %s' % auth)
    348         if realhost: h.putheader('Host', realhost)
    349         for args in self.addheaders: h.putheader(*args)
    350         h.endheaders(data)
    351         errcode, errmsg, headers = h.getreply()
    352         fp = h.getfile()
    353         if errcode == -1:
    354             if fp: fp.close()
    355             # something went wrong with the HTTP status line
    356             raise IOError, ('http protocol error', 0,
    357                             'got a bad status line', None)
        # According to RFC 2616, a "2xx" code indicates that the client's
    359         # request was successfully received, understood, and accepted.
    360         if (200 <= errcode < 300):
    361             return addinfourl(fp, headers, "http:" + url, errcode)
    362         else:
    363             if data is None:
    364                 return self.http_error(url, fp, errcode, errmsg, headers)
    365             else:
    366                 return self.http_error(url, fp, errcode, errmsg, headers, data)
    367 
    368     def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    369         """Handle http errors.
    370         Derived class can override this, or provide specific handlers
    371         named http_error_DDD where DDD is the 3-digit error code."""
    372         # First check if there's a specific handler for this error
    373         name = 'http_error_%d' % errcode
    374         if hasattr(self, name):
    375             method = getattr(self, name)
    376             if data is None:
    377                 result = method(url, fp, errcode, errmsg, headers)
    378             else:
    379                 result = method(url, fp, errcode, errmsg, headers, data)
    380             if result: return result
    381         return self.http_error_default(url, fp, errcode, errmsg, headers)
    382 
    383     def http_error_default(self, url, fp, errcode, errmsg, headers):
    384         """Default error handler: close the connection and raise IOError."""
    385         fp.close()
    386         raise IOError, ('http error', errcode, errmsg, headers)
    387 
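    # A subclass could add a handler for a specific status code instead,
    # following the http_error_DDD convention above (hypothetical sketch):
    #
    #     class TolerantOpener(URLopener):
    #         def http_error_404(self, url, fp, errcode, errmsg, headers):
    #             # Return the error page itself instead of raising IOError.
    #             return addinfourl(fp, headers, "http:" + url, errcode)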
    388     if _have_ssl:
    389         def open_https(self, url, data=None):
    390             """Use HTTPS protocol."""
    391 
    392             import httplib
    393             user_passwd = None
    394             proxy_passwd = None
    395             if isinstance(url, str):
    396                 host, selector = splithost(url)
    397                 if host:
    398                     user_passwd, host = splituser(host)
    399                     host = unquote(host)
    400                 realhost = host
    401             else:
    402                 host, selector = url
                # check whether the proxy contains authorization information
    404                 proxy_passwd, host = splituser(host)
    405                 urltype, rest = splittype(selector)
    406                 url = rest
    407                 user_passwd = None
    408                 if urltype.lower() != 'https':
    409                     realhost = None
    410                 else:
    411                     realhost, rest = splithost(rest)
    412                     if realhost:
    413                         user_passwd, realhost = splituser(realhost)
    414                     if user_passwd:
    415                         selector = "%s://%s%s" % (urltype, realhost, rest)
    416                 #print "proxy via https:", host, selector
    417             if not host: raise IOError, ('https error', 'no host given')
    418             if proxy_passwd:
    419                 proxy_passwd = unquote(proxy_passwd)
    420                 proxy_auth = base64.b64encode(proxy_passwd).strip()
    421             else:
    422                 proxy_auth = None
    423             if user_passwd:
    424                 user_passwd = unquote(user_passwd)
    425                 auth = base64.b64encode(user_passwd).strip()
    426             else:
    427                 auth = None
    428             h = httplib.HTTPS(host, 0,
    429                               key_file=self.key_file,
    430                               cert_file=self.cert_file,
    431                               context=self.context)
    432             if data is not None:
    433                 h.putrequest('POST', selector)
    434                 h.putheader('Content-Type',
    435                             'application/x-www-form-urlencoded')
    436                 h.putheader('Content-Length', '%d' % len(data))
    437             else:
    438                 h.putrequest('GET', selector)
    439             if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
    440             if auth: h.putheader('Authorization', 'Basic %s' % auth)
    441             if realhost: h.putheader('Host', realhost)
    442             for args in self.addheaders: h.putheader(*args)
    443             h.endheaders(data)
    444             errcode, errmsg, headers = h.getreply()
    445             fp = h.getfile()
    446             if errcode == -1:
    447                 if fp: fp.close()
    448                 # something went wrong with the HTTP status line
    449                 raise IOError, ('http protocol error', 0,
    450                                 'got a bad status line', None)
            # According to RFC 2616, a "2xx" code indicates that the client's
    452             # request was successfully received, understood, and accepted.
    453             if (200 <= errcode < 300):
    454                 return addinfourl(fp, headers, "https:" + url, errcode)
    455             else:
    456                 if data is None:
    457                     return self.http_error(url, fp, errcode, errmsg, headers)
    458                 else:
    459                     return self.http_error(url, fp, errcode, errmsg, headers,
    460                                            data)
    461 
    462     def open_file(self, url):
    463         """Use local file or FTP depending on form of URL."""
    464         if not isinstance(url, str):
    465             raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
    466         if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
    467             return self.open_ftp(url)
    468         else:
    469             return self.open_local_file(url)
    470 
    471     def open_local_file(self, url):
    472         """Use local file."""
    473         import mimetypes, mimetools, email.utils
    474         try:
    475             from cStringIO import StringIO
    476         except ImportError:
    477             from StringIO import StringIO
    478         host, file = splithost(url)
    479         localname = url2pathname(file)
    480         try:
    481             stats = os.stat(localname)
    482         except OSError, e:
    483             raise IOError(e.errno, e.strerror, e.filename)
    484         size = stats.st_size
    485         modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    486         mtype = mimetypes.guess_type(url)[0]
    487         headers = mimetools.Message(StringIO(
    488             'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
    489             (mtype or 'text/plain', size, modified)))
    490         if not host:
    491             urlfile = file
    492             if file[:1] == '/':
    493                 urlfile = 'file://' + file
    494             elif file[:2] == './':
    495                 raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
    496             return addinfourl(open(localname, 'rb'),
    497                               headers, urlfile)
    498         host, port = splitport(host)
    499         if not port \
    500            and socket.gethostbyname(host) in (localhost(), thishost()):
    501             urlfile = file
    502             if file[:1] == '/':
    503                 urlfile = 'file://' + file
    504             return addinfourl(open(localname, 'rb'),
    505                               headers, urlfile)
    506         raise IOError, ('local file error', 'not on local host')
    507 
    508     def open_ftp(self, url):
    509         """Use FTP protocol."""
    510         if not isinstance(url, str):
    511             raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
    512         import mimetypes, mimetools
    513         try:
    514             from cStringIO import StringIO
    515         except ImportError:
    516             from StringIO import StringIO
    517         host, path = splithost(url)
    518         if not host: raise IOError, ('ftp error', 'no host given')
    519         host, port = splitport(host)
    520         user, host = splituser(host)
    521         if user: user, passwd = splitpasswd(user)
    522         else: passwd = None
    523         host = unquote(host)
    524         user = user or ''
    525         passwd = passwd or ''
    526         host = socket.gethostbyname(host)
    527         if not port:
    528             import ftplib
    529             port = ftplib.FTP_PORT
    530         else:
    531             port = int(port)
    532         path, attrs = splitattr(path)
    533         path = unquote(path)
    534         dirs = path.split('/')
    535         dirs, file = dirs[:-1], dirs[-1]
    536         if dirs and not dirs[0]: dirs = dirs[1:]
    537         if dirs and not dirs[0]: dirs[0] = '/'
    538         key = user, host, port, '/'.join(dirs)
    539         # XXX thread unsafe!
    540         if len(self.ftpcache) > MAXFTPCACHE:
    541             # Prune the cache, rather arbitrarily
    542             for k in self.ftpcache.keys():
    543                 if k != key:
    544                     v = self.ftpcache[k]
    545                     del self.ftpcache[k]
    546                     v.close()
    547         try:
            if key not in self.ftpcache:
    549                 self.ftpcache[key] = \
    550                     ftpwrapper(user, passwd, host, port, dirs)
    551             if not file: type = 'D'
    552             else: type = 'I'
    553             for attr in attrs:
    554                 attr, value = splitvalue(attr)
    555                 if attr.lower() == 'type' and \
    556                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
    557                     type = value.upper()
    558             (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
    559             mtype = mimetypes.guess_type("ftp:" + url)[0]
    560             headers = ""
    561             if mtype:
    562                 headers += "Content-Type: %s\n" % mtype
    563             if retrlen is not None and retrlen >= 0:
    564                 headers += "Content-Length: %d\n" % retrlen
    565             headers = mimetools.Message(StringIO(headers))
    566             return addinfourl(fp, headers, "ftp:" + url)
    567         except ftperrors(), msg:
    568             raise IOError, ('ftp error', msg), sys.exc_info()[2]
    569 
    570     def open_data(self, url, data=None):
    571         """Use "data" URL."""
    572         if not isinstance(url, str):
    573             raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
    574         # ignore POSTed data
    575         #
    576         # syntax of data URLs:
    577         # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    578         # mediatype := [ type "/" subtype ] *( ";" parameter )
    579         # data      := *urlchar
    580         # parameter := attribute "=" value
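        # For example (RFC 2397 style, illustrative only):
        #     data:,A%20brief%20note                        -> "A brief note"
        #     data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==   -> "Hello, World!"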
    581         import mimetools
    582         try:
    583             from cStringIO import StringIO
    584         except ImportError:
    585             from StringIO import StringIO
    586         try:
    587             [type, data] = url.split(',', 1)
    588         except ValueError:
    589             raise IOError, ('data error', 'bad data URL')
    590         if not type:
    591             type = 'text/plain;charset=US-ASCII'
    592         semi = type.rfind(';')
    593         if semi >= 0 and '=' not in type[semi:]:
    594             encoding = type[semi+1:]
    595             type = type[:semi]
    596         else:
    597             encoding = ''
    598         msg = []
    599         msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
    600                                             time.gmtime(time.time())))
    601         msg.append('Content-type: %s' % type)
    602         if encoding == 'base64':
    603             data = base64.decodestring(data)
    604         else:
    605             data = unquote(data)
    606         msg.append('Content-Length: %d' % len(data))
    607         msg.append('')
    608         msg.append(data)
    609         msg = '\n'.join(msg)
    610         f = StringIO(msg)
    611         headers = mimetools.Message(f, 0)
    612         #f.fileno = None     # needed for addinfourl
    613         return addinfourl(f, headers, url)
    614 
    615 
    616 class FancyURLopener(URLopener):
    617     """Derived class with handlers for errors we can handle (perhaps)."""
    618 
    619     def __init__(self, *args, **kwargs):
    620         URLopener.__init__(self, *args, **kwargs)
    621         self.auth_cache = {}
    622         self.tries = 0
    623         self.maxtries = 10
    624 
    625     def http_error_default(self, url, fp, errcode, errmsg, headers):
    626         """Default error handling -- don't raise an exception."""
    627         return addinfourl(fp, headers, "http:" + url, errcode)
    628 
    629     def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
    630         """Error 302 -- relocated (temporarily)."""
    631         self.tries += 1
    632         try:
    633             if self.maxtries and self.tries >= self.maxtries:
    634                 if hasattr(self, "http_error_500"):
    635                     meth = self.http_error_500
    636                 else:
    637                     meth = self.http_error_default
    638                 return meth(url, fp, 500,
    639                             "Internal Server Error: Redirect Recursion",
    640                             headers)
    641             result = self.redirect_internal(url, fp, errcode, errmsg,
    642                                             headers, data)
    643             return result
    644         finally:
    645             self.tries = 0
    646 
    647     def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
    648         if 'location' in headers:
    649             newurl = headers['location']
    650         elif 'uri' in headers:
    651             newurl = headers['uri']
    652         else:
    653             return
    654         fp.close()
    655         # In case the server sent a relative URL, join with original:
    656         newurl = basejoin(self.type + ":" + url, newurl)
    657 
    658         # For security reasons we do not allow redirects to protocols
    659         # other than HTTP, HTTPS or FTP.
    660         newurl_lower = newurl.lower()
    661         if not (newurl_lower.startswith('http://') or
    662                 newurl_lower.startswith('https://') or
    663                 newurl_lower.startswith('ftp://')):
    664             raise IOError('redirect error', errcode,
    665                           errmsg + " - Redirection to url '%s' is not allowed" %
    666                           newurl,
    667                           headers)
    668 
    669         return self.open(newurl)
    670 
    671     def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
    672         """Error 301 -- also relocated (permanently)."""
    673         return self.http_error_302(url, fp, errcode, errmsg, headers, data)
    674 
    675     def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
    676         """Error 303 -- also relocated (essentially identical to 302)."""
    677         return self.http_error_302(url, fp, errcode, errmsg, headers, data)
    678 
    679     def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
    680         """Error 307 -- relocated, but turn POST into error."""
    681         if data is None:
    682             return self.http_error_302(url, fp, errcode, errmsg, headers, data)
    683         else:
    684             return self.http_error_default(url, fp, errcode, errmsg, headers)
    685 
    686     def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
    687         """Error 401 -- authentication required.
    688         This function supports Basic authentication only."""
        if 'www-authenticate' not in headers:
    690             URLopener.http_error_default(self, url, fp,
    691                                          errcode, errmsg, headers)
    692         stuff = headers['www-authenticate']
    693         import re
    694         match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
    695         if not match:
    696             URLopener.http_error_default(self, url, fp,
    697                                          errcode, errmsg, headers)
    698         scheme, realm = match.groups()
    699         if scheme.lower() != 'basic':
    700             URLopener.http_error_default(self, url, fp,
    701                                          errcode, errmsg, headers)
    702         name = 'retry_' + self.type + '_basic_auth'
    703         if data is None:
    704             return getattr(self,name)(url, realm)
    705         else:
    706             return getattr(self,name)(url, realm, data)
    707 
    708     def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
    709         """Error 407 -- proxy authentication required.
    710         This function supports Basic authentication only."""
        if 'proxy-authenticate' not in headers:
    712             URLopener.http_error_default(self, url, fp,
    713                                          errcode, errmsg, headers)
    714         stuff = headers['proxy-authenticate']
    715         import re
    716         match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
    717         if not match:
    718             URLopener.http_error_default(self, url, fp,
    719                                          errcode, errmsg, headers)
    720         scheme, realm = match.groups()
    721         if scheme.lower() != 'basic':
    722             URLopener.http_error_default(self, url, fp,
    723                                          errcode, errmsg, headers)
    724         name = 'retry_proxy_' + self.type + '_basic_auth'
    725         if data is None:
    726             return getattr(self,name)(url, realm)
    727         else:
    728             return getattr(self,name)(url, realm, data)
    729 
    730     def retry_proxy_http_basic_auth(self, url, realm, data=None):
    731         host, selector = splithost(url)
    732         newurl = 'http://' + host + selector
    733         proxy = self.proxies['http']
    734         urltype, proxyhost = splittype(proxy)
    735         proxyhost, proxyselector = splithost(proxyhost)
    736         i = proxyhost.find('@') + 1
    737         proxyhost = proxyhost[i:]
    738         user, passwd = self.get_user_passwd(proxyhost, realm, i)
    739         if not (user or passwd): return None
    740         proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
    741         self.proxies['http'] = 'http://' + proxyhost + proxyselector
    742         if data is None:
    743             return self.open(newurl)
    744         else:
    745             return self.open(newurl, data)
    746 
    747     def retry_proxy_https_basic_auth(self, url, realm, data=None):
    748         host, selector = splithost(url)
    749         newurl = 'https://' + host + selector
    750         proxy = self.proxies['https']
    751         urltype, proxyhost = splittype(proxy)
    752         proxyhost, proxyselector = splithost(proxyhost)
    753         i = proxyhost.find('@') + 1
    754         proxyhost = proxyhost[i:]
    755         user, passwd = self.get_user_passwd(proxyhost, realm, i)
    756         if not (user or passwd): return None
    757         proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
    758         self.proxies['https'] = 'https://' + proxyhost + proxyselector
    759         if data is None:
    760             return self.open(newurl)
    761         else:
    762             return self.open(newurl, data)
    763 
    764     def retry_http_basic_auth(self, url, realm, data=None):
    765         host, selector = splithost(url)
    766         i = host.find('@') + 1
    767         host = host[i:]
    768         user, passwd = self.get_user_passwd(host, realm, i)
    769         if not (user or passwd): return None
    770         host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
    771         newurl = 'http://' + host + selector
    772         if data is None:
    773             return self.open(newurl)
    774         else:
    775             return self.open(newurl, data)
    776 
    777     def retry_https_basic_auth(self, url, realm, data=None):
    778         host, selector = splithost(url)
    779         i = host.find('@') + 1
    780         host = host[i:]
    781         user, passwd = self.get_user_passwd(host, realm, i)
    782         if not (user or passwd): return None
    783         host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
    784         newurl = 'https://' + host + selector
    785         if data is None:
    786             return self.open(newurl)
    787         else:
    788             return self.open(newurl, data)
    789 
    790     def get_user_passwd(self, host, realm, clear_cache=0):
    791         key = realm + '@' + host.lower()
    792         if key in self.auth_cache:
    793             if clear_cache:
    794                 del self.auth_cache[key]
    795             else:
    796                 return self.auth_cache[key]
    797         user, passwd = self.prompt_user_passwd(host, realm)
    798         if user or passwd: self.auth_cache[key] = (user, passwd)
    799         return user, passwd
    800 
    801     def prompt_user_passwd(self, host, realm):
    802         """Override this in a GUI environment!"""
    803         import getpass
    804         try:
    805             user = raw_input("Enter username for %s at %s: " % (realm,
    806                                                                 host))
    807             passwd = getpass.getpass("Enter password for %s in %s at %s: " %
    808                 (user, realm, host))
    809             return user, passwd
    810         except KeyboardInterrupt:
    811             print
    812             return None, None
    813 
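# Sketch of a non-interactive subclass for Basic auth (the credentials and URL
# are purely illustrative; a real application would fetch them from secure
# storage rather than hard-coding them):
#
#     class MyOpener(FancyURLopener):
#         def prompt_user_passwd(self, host, realm):
#             return 'user', 'secret'
#
#     f = MyOpener().open("http://example.com/protected/")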
    814 
    815 # Utility functions
    816 
    817 _localhost = None
    818 def localhost():
    819     """Return the IP address of the magic hostname 'localhost'."""
    820     global _localhost
    821     if _localhost is None:
    822         _localhost = socket.gethostbyname('localhost')
    823     return _localhost
    824 
    825 _thishost = None
    826 def thishost():
    827     """Return the IP address of the current host."""
    828     global _thishost
    829     if _thishost is None:
    830         try:
    831             _thishost = socket.gethostbyname(socket.gethostname())
    832         except socket.gaierror:
    833             _thishost = socket.gethostbyname('localhost')
    834     return _thishost
    835 
    836 _ftperrors = None
    837 def ftperrors():
    838     """Return the set of errors raised by the FTP class."""
    839     global _ftperrors
    840     if _ftperrors is None:
    841         import ftplib
    842         _ftperrors = ftplib.all_errors
    843     return _ftperrors
    844 
    845 _noheaders = None
    846 def noheaders():
    847     """Return an empty mimetools.Message object."""
    848     global _noheaders
    849     if _noheaders is None:
    850         import mimetools
    851         try:
    852             from cStringIO import StringIO
    853         except ImportError:
    854             from StringIO import StringIO
    855         _noheaders = mimetools.Message(StringIO(), 0)
    856         _noheaders.fp.close()   # Recycle file descriptor
    857     return _noheaders
    858 
    859 
    860 # Utility classes
    861 
    862 class ftpwrapper:
    """Class used by open_ftp() to cache open FTP connections."""
    864 
    865     def __init__(self, user, passwd, host, port, dirs,
    866                  timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
    867                  persistent=True):
    868         self.user = user
    869         self.passwd = passwd
    870         self.host = host
    871         self.port = port
    872         self.dirs = dirs
    873         self.timeout = timeout
    874         self.refcount = 0
    875         self.keepalive = persistent
    876         try:
    877             self.init()
    878         except:
    879             self.close()
    880             raise
    881 
    882     def init(self):
    883         import ftplib
    884         self.busy = 0
    885         self.ftp = ftplib.FTP()
    886         self.ftp.connect(self.host, self.port, self.timeout)
    887         self.ftp.login(self.user, self.passwd)
    888         _target = '/'.join(self.dirs)
    889         self.ftp.cwd(_target)
    890 
    891     def retrfile(self, file, type):
    892         import ftplib
    893         self.endtransfer()
    894         if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
    895         else: cmd = 'TYPE ' + type; isdir = 0
    896         try:
    897             self.ftp.voidcmd(cmd)
    898         except ftplib.all_errors:
    899             self.init()
    900             self.ftp.voidcmd(cmd)
    901         conn = None
    902         if file and not isdir:
    903             # Try to retrieve as a file
    904             try:
    905                 cmd = 'RETR ' + file
    906                 conn, retrlen = self.ftp.ntransfercmd(cmd)
    907             except ftplib.error_perm, reason:
    908                 if str(reason)[:3] != '550':
    909                     raise IOError, ('ftp error', reason), sys.exc_info()[2]
    910         if not conn:
    911             # Set transfer mode to ASCII!
    912             self.ftp.voidcmd('TYPE A')
    913             # Try a directory listing. Verify that directory exists.
    914             if file:
    915                 pwd = self.ftp.pwd()
    916                 try:
    917                     try:
    918                         self.ftp.cwd(file)
    919                     except ftplib.error_perm, reason:
    920                         raise IOError, ('ftp error', reason), sys.exc_info()[2]
    921                 finally:
    922                     self.ftp.cwd(pwd)
    923                 cmd = 'LIST ' + file
    924             else:
    925                 cmd = 'LIST'
    926             conn, retrlen = self.ftp.ntransfercmd(cmd)
    927         self.busy = 1
    928         ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
    929         self.refcount += 1
    930         conn.close()
    931         # Pass back both a suitably decorated object and a retrieval length
    932         return (ftpobj, retrlen)
    933 
    934     def endtransfer(self):
    935         self.busy = 0
    936 
    937     def close(self):
    938         self.keepalive = False
    939         if self.refcount <= 0:
    940             self.real_close()
    941 
    942     def file_close(self):
    943         self.endtransfer()
    944         self.refcount -= 1
    945         if self.refcount <= 0 and not self.keepalive:
    946             self.real_close()
    947 
    948     def real_close(self):
    949         self.endtransfer()
    950         try:
    951             self.ftp.close()
    952         except ftperrors():
    953             pass
    954 
    955 class addbase:
    956     """Base class for addinfo and addclosehook."""
    957 
    958     def __init__(self, fp):
    959         self.fp = fp
    960         self.read = self.fp.read
    961         self.readline = self.fp.readline
    962         if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
    963         if hasattr(self.fp, "fileno"):
    964             self.fileno = self.fp.fileno
    965         else:
    966             self.fileno = lambda: None
    967         if hasattr(self.fp, "__iter__"):
    968             self.__iter__ = self.fp.__iter__
    969             if hasattr(self.fp, "next"):
    970                 self.next = self.fp.next
    971 
    972     def __repr__(self):
    973         return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
    974                                              id(self), self.fp)
    975 
    976     def close(self):
    977         self.read = None
    978         self.readline = None
    979         self.readlines = None
    980         self.fileno = None
    981         if self.fp: self.fp.close()
    982         self.fp = None
    983 
    984 class addclosehook(addbase):
    985     """Class to add a close hook to an open file."""
    986 
    987     def __init__(self, fp, closehook, *hookargs):
    988         addbase.__init__(self, fp)
    989         self.closehook = closehook
    990         self.hookargs = hookargs
    991 
    992     def close(self):
    993         try:
    994             closehook = self.closehook
    995             hookargs = self.hookargs
    996             if closehook:
    997                 self.closehook = None
    998                 self.hookargs = None
    999                 closehook(*hookargs)
   1000         finally:
   1001             addbase.close(self)
   1002 
   1003 
   1004 class addinfo(addbase):
   1005     """class to add an info() method to an open file."""
   1006 
   1007     def __init__(self, fp, headers):
   1008         addbase.__init__(self, fp)
   1009         self.headers = headers
   1010 
   1011     def info(self):
   1012         return self.headers
   1013 
   1014 class addinfourl(addbase):
   1015     """class to add info() and geturl() methods to an open file."""
   1016 
   1017     def __init__(self, fp, headers, url, code=None):
   1018         addbase.__init__(self, fp)
   1019         self.headers = headers
   1020         self.url = url
   1021         self.code = code
   1022 
   1023     def info(self):
   1024         return self.headers
   1025 
   1026     def getcode(self):
   1027         return self.code
   1028 
   1029     def geturl(self):
   1030         return self.url
   1031 
   1032 
   1033 # Utilities to parse URLs (most of these return None for missing parts):
   1034 # unwrap('<URL:type://host/path>') --> 'type://host/path'
   1035 # splittype('type:opaquestring') --> 'type', 'opaquestring'
   1036 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
   1037 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
   1038 # splitpasswd('user:passwd') -> 'user', 'passwd'
   1039 # splitport('host:port') --> 'host', 'port'
   1040 # splitquery('/path?query') --> '/path', 'query'
   1041 # splittag('/path#tag') --> '/path', 'tag'
   1042 # splitattr('/path;attr1=value1;attr2=value2;...') ->
   1043 #   '/path', ['attr1=value1', 'attr2=value2', ...]
   1044 # splitvalue('attr=value') --> 'attr', 'value'
   1045 # unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
   1047 
   1048 try:
   1049     unicode
   1050 except NameError:
   1051     def _is_unicode(x):
   1052         return 0
   1053 else:
   1054     def _is_unicode(x):
   1055         return isinstance(x, unicode)
   1056 
   1057 def toBytes(url):
   1058     """toBytes(u"URL") --> 'URL'."""
   1059     # Most URL schemes require ASCII. If that changes, the conversion
   1060     # can be relaxed
   1061     if _is_unicode(url):
   1062         try:
   1063             url = url.encode("ASCII")
   1064         except UnicodeError:
   1065             raise UnicodeError("URL " + repr(url) +
   1066                                " contains non-ASCII characters")
   1067     return url
   1068 
   1069 def unwrap(url):
   1070     """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
   1071     url = url.strip()
   1072     if url[:1] == '<' and url[-1:] == '>':
   1073         url = url[1:-1].strip()
   1074     if url[:4] == 'URL:': url = url[4:].strip()
   1075     return url
   1076 
   1077 _typeprog = None
   1078 def splittype(url):
   1079     """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
   1080     global _typeprog
   1081     if _typeprog is None:
   1082         import re
   1083         _typeprog = re.compile('^([^/:]+):')
   1084 
   1085     match = _typeprog.match(url)
   1086     if match:
   1087         scheme = match.group(1)
   1088         return scheme.lower(), url[len(scheme) + 1:]
   1089     return None, url
   1090 
   1091 _hostprog = None
   1092 def splithost(url):
   1093     """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
   1094     global _hostprog
   1095     if _hostprog is None:
   1096         _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)
   1097 
   1098     match = _hostprog.match(url)
   1099     if match:
   1100         host_port = match.group(1)
   1101         path = match.group(2)
   1102         if path and not path.startswith('/'):
   1103             path = '/' + path
   1104         return host_port, path
   1105     return None, url
   1106 
   1107 _userprog = None
   1108 def splituser(host):
   1109     """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
   1110     global _userprog
   1111     if _userprog is None:
   1112         import re
   1113         _userprog = re.compile('^(.*)@(.*)$')
   1114 
   1115     match = _userprog.match(host)
   1116     if match: return match.group(1, 2)
   1117     return None, host
   1118 
   1119 _passwdprog = None
   1120 def splitpasswd(user):
   1121     """splitpasswd('user:passwd') -> 'user', 'passwd'."""
   1122     global _passwdprog
   1123     if _passwdprog is None:
   1124         import re
   1125         _passwdprog = re.compile('^([^:]*):(.*)$',re.S)
   1126 
   1127     match = _passwdprog.match(user)
   1128     if match: return match.group(1, 2)
   1129     return user, None
   1130 
   1131 # splittag('/path#tag') --> '/path', 'tag'
   1132 _portprog = None
   1133 def splitport(host):
   1134     """splitport('host:port') --> 'host', 'port'."""
   1135     global _portprog
   1136     if _portprog is None:
   1137         import re
   1138         _portprog = re.compile('^(.*):([0-9]*)$')
   1139 
   1140     match = _portprog.match(host)
   1141     if match:
   1142         host, port = match.groups()
   1143         if port:
   1144             return host, port
   1145     return host, None
   1146 
   1147 _nportprog = None
   1148 def splitnport(host, defport=-1):
   1149     """Split host and port, returning numeric port.
   1150     Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
   1152     Return None if ':' but not a valid number."""
   1153     global _nportprog
   1154     if _nportprog is None:
   1155         import re
   1156         _nportprog = re.compile('^(.*):(.*)$')
   1157 
   1158     match = _nportprog.match(host)
   1159     if match:
   1160         host, port = match.group(1, 2)
   1161         if port:
   1162             try:
   1163                 nport = int(port)
   1164             except ValueError:
   1165                 nport = None
   1166             return host, nport
   1167     return host, defport
   1168 
   1169 _queryprog = None
   1170 def splitquery(url):
   1171     """splitquery('/path?query') --> '/path', 'query'."""
   1172     global _queryprog
   1173     if _queryprog is None:
   1174         import re
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')
   1176 
   1177     match = _queryprog.match(url)
   1178     if match: return match.group(1, 2)
   1179     return url, None
   1180 
   1181 _tagprog = None
   1182 def splittag(url):
   1183     """splittag('/path#tag') --> '/path', 'tag'."""
   1184     global _tagprog
   1185     if _tagprog is None:
   1186         import re
   1187         _tagprog = re.compile('^(.*)#([^#]*)$')
   1188 
   1189     match = _tagprog.match(url)
   1190     if match: return match.group(1, 2)
   1191     return url, None
   1192 
   1193 def splitattr(url):
   1194     """splitattr('/path;attr1=value1;attr2=value2;...') ->
   1195         '/path', ['attr1=value1', 'attr2=value2', ...]."""
   1196     words = url.split(';')
   1197     return words[0], words[1:]
   1198 
   1199 _valueprog = None
   1200 def splitvalue(attr):
   1201     """splitvalue('attr=value') --> 'attr', 'value'."""
   1202     global _valueprog
   1203     if _valueprog is None:
   1204         import re
   1205         _valueprog = re.compile('^([^=]*)=(.*)$')
   1206 
   1207     match = _valueprog.match(attr)
   1208     if match: return match.group(1, 2)
   1209     return attr, None
   1210 
   1211 # urlparse contains a duplicate of this method to avoid a circular import.  If
   1212 # you update this method, also update the copy in urlparse.  This code
   1213 # duplication does not exist in Python3.
   1214 
   1215 _hexdig = '0123456789ABCDEFabcdef'
   1216 _hextochr = dict((a + b, chr(int(a + b, 16)))
   1217                  for a in _hexdig for b in _hexdig)
   1218 _asciire = re.compile('([\x00-\x7f]+)')
   1219 
   1220 def unquote(s):
   1221     """unquote('abc%20def') -> 'abc def'."""
   1222     if _is_unicode(s):
   1223         if '%' not in s:
   1224             return s
   1225         bits = _asciire.split(s)
   1226         res = [bits[0]]
   1227         append = res.append
   1228         for i in range(1, len(bits), 2):
   1229             append(unquote(str(bits[i])).decode('latin1'))
   1230             append(bits[i + 1])
   1231         return ''.join(res)
   1232 
   1233     bits = s.split('%')
   1234     # fastpath
   1235     if len(bits) == 1:
   1236         return s
   1237     res = [bits[0]]
   1238     append = res.append
   1239     for item in bits[1:]:
   1240         try:
   1241             append(_hextochr[item[:2]])
   1242             append(item[2:])
   1243         except KeyError:
   1244             append('%')
   1245             append(item)
   1246     return ''.join(res)
   1247 
   1248 def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'"""
   1250     s = s.replace('+', ' ')
   1251     return unquote(s)
   1252 
   1253 always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
   1254                'abcdefghijklmnopqrstuvwxyz'
   1255                '0123456789' '_.-')
   1256 _safe_map = {}
   1257 for i, c in zip(xrange(256), str(bytearray(xrange(256)))):
   1258     _safe_map[c] = c if (i < 128 and c in always_safe) else '%{:02X}'.format(i)
   1259 _safe_quoters = {}
   1260 
   1261 def quote(s, safe='/'):
   1262     """quote('abc def') -> 'abc%20def'
   1263 
   1264     Each part of a URL, e.g. the path info, the query, etc., has a
   1265     different set of reserved characters that must be quoted.
   1266 
   1267     RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
   1268     the following reserved characters.
   1269 
   1270     reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
   1271                   "$" | ","
   1272 
   1273     Each of these characters is reserved in some component of a URL,
   1274     but not necessarily in all of them.
   1275 
   1276     By default, the quote function is intended for quoting the path
   1277     section of a URL.  Thus, it will not encode '/'.  This character
   1278     is reserved, but in typical usage the quote function is being
   1279     called on a path where the existing slash characters are used as
   1280     reserved characters.
   1281     """
   1282     # fastpath
   1283     if not s:
   1284         if s is None:
   1285             raise TypeError('None object cannot be quoted')
   1286         return s
   1287     cachekey = (safe, always_safe)
   1288     try:
   1289         (quoter, safe) = _safe_quoters[cachekey]
   1290     except KeyError:
   1291         safe_map = _safe_map.copy()
   1292         safe_map.update([(c, c) for c in safe])
   1293         quoter = safe_map.__getitem__
   1294         safe = always_safe + safe
   1295         _safe_quoters[cachekey] = (quoter, safe)
   1296     if not s.rstrip(safe):
   1297         return s
   1298     return ''.join(map(quoter, s))
   1299 
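# Illustrative helper (assumed _demo_* name, not part of the module): how the
# 'safe' argument of quote() above changes which characters stay unescaped.
def _demo_quote():
    # '/' is safe by default, so path separators survive quoting
    assert quote('/a path/with spaces') == '/a%20path/with%20spaces'
    # an empty safe string percent-encodes '/' as well
    assert quote('/a path', safe='') == '%2Fa%20path'
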
   1300 def quote_plus(s, safe=''):
   1301     """Quote the query fragment of a URL, replacing ' ' with '+'."""
   1302     if ' ' in s:
   1303         s = quote(s, safe + ' ')
   1304         return s.replace(' ', '+')
   1305     return quote(s, safe)
   1306 
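# Illustrative helper (assumed _demo_* name, not part of the module): the '+'
# convention for query strings used by quote_plus() and unquote_plus() above.
def _demo_quote_plus():
    assert quote_plus('name=web frameworks') == 'name%3Dweb+frameworks'
    assert unquote_plus('%7e/abc+def') == '~/abc def'
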
   1307 def urlencode(query, doseq=0):
   1308     """Encode a sequence of two-element tuples or a dictionary into a URL query string.
   1309 
   1310     If any values in the query arg are sequences and doseq is true, each
   1311     sequence element is converted to a separate parameter.
   1312 
   1313     If the query arg is a sequence of two-element tuples, the order of the
   1314     parameters in the output will match the order of parameters in the
   1315     input.
   1316     """
   1317 
   1318     if hasattr(query,"items"):
   1319         # mapping objects
   1320         query = query.items()
   1321     else:
   1322         # it's a bother at times that strings and string-like objects are
   1323         # sequences...
   1324         try:
   1325             # non-sequence items should not work with len()
   1326             # non-empty strings will fail this
   1327             if len(query) and not isinstance(query[0], tuple):
   1328                 raise TypeError
   1329             # zero-length sequences of all types will get here and succeed,
   1330             # but that's a minor nit - since the original implementation
   1331             # allowed empty dicts that type of behavior probably should be
   1332             # preserved for consistency
   1333         except TypeError:
   1334             ty,va,tb = sys.exc_info()
   1335             raise TypeError, "not a valid non-string sequence or mapping object", tb
   1336 
   1337     l = []
   1338     if not doseq:
   1339         # preserve old behavior
   1340         for k, v in query:
   1341             k = quote_plus(str(k))
   1342             v = quote_plus(str(v))
   1343             l.append(k + '=' + v)
   1344     else:
   1345         for k, v in query:
   1346             k = quote_plus(str(k))
   1347             if isinstance(v, str):
   1348                 v = quote_plus(v)
   1349                 l.append(k + '=' + v)
   1350             elif _is_unicode(v):
   1351                 # is there a reasonable way to convert to ASCII?
   1352                 # encode generates a string, but "replace" or "ignore"
   1353                 # lose information and "strict" can raise UnicodeError
   1354                 v = quote_plus(v.encode("ASCII","replace"))
   1355                 l.append(k + '=' + v)
   1356             else:
   1357                 try:
   1358                     # is this a sufficient test for sequence-ness?
   1359                     len(v)
   1360                 except TypeError:
   1361                     # not a sequence
   1362                     v = quote_plus(str(v))
   1363                     l.append(k + '=' + v)
   1364                 else:
   1365                     # loop over the sequence
   1366                     for elt in v:
   1367                         l.append(k + '=' + quote_plus(str(elt)))
   1368     return '&'.join(l)
   1369 
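# Illustrative helper (assumed _demo_* name, not part of the module): the
# difference the doseq flag described above makes for sequence values.
def _demo_urlencode():
    # pairs are emitted in input order for a sequence of tuples
    assert urlencode([('q', 'python'), ('page', 2)]) == 'q=python&page=2'
    # without doseq, a list value is stringified wholesale...
    assert urlencode({'tag': ['a', 'b']}) == 'tag=%5B%27a%27%2C+%27b%27%5D'
    # ...with doseq=1, each element becomes its own parameter
    assert urlencode({'tag': ['a', 'b']}, doseq=1) == 'tag=a&tag=b'
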
   1370 # Proxy handling
   1371 def getproxies_environment():
   1372     """Return a dictionary of scheme -> proxy server URL mappings.
   1373 
   1374     Scan the environment for variables named <scheme>_proxy;
   1375     this seems to be the standard convention.  In order to prefer lowercase
   1376     variables, we process the environment in two passes: the first pass
   1377     matches variables of any case, the second only lowercase ones.
   1378 
   1379     If you need a different way, you can pass a proxies dictionary to the
   1380     [Fancy]URLopener constructor.
   1381     """
   1382     # Get all variables
   1383     proxies = {}
   1384     for name, value in os.environ.items():
   1385         name = name.lower()
   1386         if value and name[-6:] == '_proxy':
   1387             proxies[name[:-6]] = value
   1388 
   1389     # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY
   1390     # (non-all-lowercase) as it may be set from the web server by a "Proxy:"
   1391     # header from the client
   1392     # If "proxy" is lowercase, it will still be used thanks to the next block
   1393     if 'REQUEST_METHOD' in os.environ:
   1394         proxies.pop('http', None)
   1395 
   1396     # Get lowercase variables
   1397     for name, value in os.environ.items():
   1398         if name[-6:] == '_proxy':
   1399             name = name.lower()
   1400             if value:
   1401                 proxies[name[:-6]] = value
   1402             else:
   1403                 proxies.pop(name[:-6], None)
   1404 
   1405     return proxies
   1406 
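# Illustrative helper (assumed _demo_* name and a placeholder proxy URL, not
# part of the module): the <scheme>_proxy convention documented above.
def _demo_getproxies_environment():
    saved = os.environ.get('ftp_proxy')
    os.environ['ftp_proxy'] = 'http://proxy.example.com:3128'  # placeholder
    try:
        assert getproxies_environment().get('ftp') == 'http://proxy.example.com:3128'
    finally:
        if saved is None:
            del os.environ['ftp_proxy']
        else:
            os.environ['ftp_proxy'] = saved
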
   1407 def proxy_bypass_environment(host, proxies=None):
   1408     """Test if proxies should not be used for a particular host.
   1409 
   1410     Checks the proxies dict for the value of no_proxy, which should be a
   1411     list of comma-separated DNS suffixes, or '*' for all hosts.
   1412     """
   1413     if proxies is None:
   1414         proxies = getproxies_environment()
   1415     # don't bypass if no_proxy isn't specified
   1416     try:
   1417         no_proxy = proxies['no']
   1418     except KeyError:
   1419         return 0
   1420     # '*' is a special case meaning always bypass
   1421     if no_proxy == '*':
   1422         return 1
   1423     # strip port off host
   1424     hostonly, port = splitport(host)
   1425     # check if the host ends with any of the DNS suffixes
   1426     no_proxy_list = [proxy.strip() for proxy in no_proxy.split(',')]
   1427     for name in no_proxy_list:
   1428         if name:
   1429             name = name.lstrip('.')  # ignore leading dots
   1430             name = re.escape(name)
   1431             pattern = r'(.+\.)?%s$' % name
   1432             if (re.match(pattern, hostonly, re.I)
   1433                     or re.match(pattern, host, re.I)):
   1434                 return 1
   1435     # otherwise, don't bypass
   1436     return 0
   1437 
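# Illustrative helper (assumed _demo_* name and placeholder host names, not
# part of the module): the no_proxy semantics described above, checked against
# an explicit proxies dict so the real environment is left untouched.
def _demo_proxy_bypass_environment():
    assert proxy_bypass_environment('www.example.com', {'no': 'example.com'}) == 1
    assert proxy_bypass_environment('www.example.org', {'no': 'example.com'}) == 0
    # '*' means "bypass the proxy for every host"
    assert proxy_bypass_environment('anything', {'no': '*'}) == 1
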
   1438 
   1439 if sys.platform == 'darwin':
   1440     from _scproxy import _get_proxy_settings, _get_proxies
   1441 
   1442     def proxy_bypass_macosx_sysconf(host):
   1443         """
   1444         Return True iff this host shouldn't be accessed using a proxy
   1445 
   1446         This function uses the MacOSX framework SystemConfiguration
   1447         to fetch the proxy information.
   1448         """
   1449         import re
   1450         import socket
   1451         from fnmatch import fnmatch
   1452 
   1453         hostonly, port = splitport(host)
   1454 
   1455         def ip2num(ipAddr):
   1456             parts = ipAddr.split('.')
   1457             parts = map(int, parts)
   1458             if len(parts) != 4:
   1459                 parts = (parts + [0, 0, 0, 0])[:4]
   1460             return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
   1461 
   1462         proxy_settings = _get_proxy_settings()
   1463 
   1464         # Check for simple host names:
   1465         if '.' not in host:
   1466             if proxy_settings['exclude_simple']:
   1467                 return True
   1468 
   1469         hostIP = None
   1470 
   1471         for value in proxy_settings.get('exceptions', ()):
   1472             # Items in the list are strings like these: *.local, 169.254/16
   1473             if not value: continue
   1474 
   1475             m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
   1476             if m is not None:
   1477                 if hostIP is None:
   1478                     try:
   1479                         hostIP = socket.gethostbyname(hostonly)
   1480                         hostIP = ip2num(hostIP)
   1481                     except socket.error:
   1482                         continue
   1483 
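                # For a network-style exception such as '169.254/16'
                # (illustrative value), the arithmetic below compares only the
                # top <prefix> bits of the host and base addresses.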
   1484                 base = ip2num(m.group(1))
   1485                 mask = m.group(2)
   1486                 if mask is None:
   1487                     mask = 8 * (m.group(1).count('.') + 1)
   1488 
   1489                 else:
   1490                     mask = int(mask[1:])
   1491                 mask = 32 - mask
   1492 
   1493                 if (hostIP >> mask) == (base >> mask):
   1494                     return True
   1495 
   1496             elif fnmatch(host, value):
   1497                 return True
   1498 
   1499         return False
   1500 
   1501     def getproxies_macosx_sysconf():
   1502         """Return a dictionary of scheme -> proxy server URL mappings.
   1503 
   1504         This function uses the MacOSX framework SystemConfiguration
   1505         to fetch the proxy information.
   1506         """
   1507         return _get_proxies()
   1508 
   1509     def proxy_bypass(host):
   1510         """Return True if a host should be bypassed.
   1511 
   1512         Checks proxy settings gathered from the environment, if specified, or
   1513         from the MacOSX framework SystemConfiguration.
   1514         """
   1515         proxies = getproxies_environment()
   1516         if proxies:
   1517             return proxy_bypass_environment(host, proxies)
   1518         else:
   1519             return proxy_bypass_macosx_sysconf(host)
   1520 
   1521     def getproxies():
   1522         return getproxies_environment() or getproxies_macosx_sysconf()
   1523 
   1524 elif os.name == 'nt':
   1525     def getproxies_registry():
   1526         """Return a dictionary of scheme -> proxy server URL mappings.
   1527 
   1528         Win32 uses the registry to store proxies.
   1529 
   1530         """
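        # The ProxyServer value handled below is either a single "host:port"
        # used for every protocol, or a per-protocol list such as
        # "http=proxy.example.com:80;ftp=proxy.example.com:21" (illustrative).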
   1531         proxies = {}
   1532         try:
   1533             import _winreg
   1534         except ImportError:
   1535             # Std module, so should be around - but you never know!
   1536             return proxies
   1537         try:
   1538             internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
   1539                 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
   1540             proxyEnable = _winreg.QueryValueEx(internetSettings,
   1541                                                'ProxyEnable')[0]
   1542             if proxyEnable:
   1543                 # Returned as Unicode, but causes problems if not converted to ASCII
   1544                 proxyServer = str(_winreg.QueryValueEx(internetSettings,
   1545                                                        'ProxyServer')[0])
   1546                 if '=' in proxyServer:
   1547                     # Per-protocol settings
   1548                     for p in proxyServer.split(';'):
   1549                         protocol, address = p.split('=', 1)
   1550                         # See if address has a type:// prefix
   1552                         if not re.match('^([^/:]+)://', address):
   1553                             address = '%s://%s' % (protocol, address)
   1554                         proxies[protocol] = address
   1555                 else:
   1556                     # Use one setting for all protocols
   1557                     if proxyServer[:5] == 'http:':
   1558                         proxies['http'] = proxyServer
   1559                     else:
   1560                         proxies['http'] = 'http://%s' % proxyServer
   1561                         proxies['https'] = 'https://%s' % proxyServer
   1562                         proxies['ftp'] = 'ftp://%s' % proxyServer
   1563             internetSettings.Close()
   1564         except (WindowsError, ValueError, TypeError):
   1565             # Either the registry key was not found, or the value was
   1566             # in an unexpected format.
   1567             # proxies is already an empty dict, so there is nothing to do.
   1568             pass
   1569         return proxies
   1570 
   1571     def getproxies():
   1572         """Return a dictionary of scheme -> proxy server URL mappings.
   1573 
   1574         Returns settings gathered from the environment, if specified,
   1575         or the registry.
   1576 
   1577         """
   1578         return getproxies_environment() or getproxies_registry()
   1579 
   1580     def proxy_bypass_registry(host):
   1581         try:
   1582             import _winreg
   1583             import re
   1584         except ImportError:
   1585             # Std modules, so should be around - but you never know!
   1586             return 0
   1587         try:
   1588             internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
   1589                 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
   1590             proxyEnable = _winreg.QueryValueEx(internetSettings,
   1591                                                'ProxyEnable')[0]
   1592             proxyOverride = str(_winreg.QueryValueEx(internetSettings,
   1593                                                      'ProxyOverride')[0])
   1594             # ^^^^ Returned as Unicode, but causes problems if not converted to ASCII
   1595         except WindowsError:
   1596             return 0
   1597         if not proxyEnable or not proxyOverride:
   1598             return 0
   1599         # try to make a host list from name and IP address.
   1600         rawHost, port = splitport(host)
   1601         host = [rawHost]
   1602         try:
   1603             addr = socket.gethostbyname(rawHost)
   1604             if addr != rawHost:
   1605                 host.append(addr)
   1606         except socket.error:
   1607             pass
   1608         try:
   1609             fqdn = socket.getfqdn(rawHost)
   1610             if fqdn != rawHost:
   1611                 host.append(fqdn)
   1612         except socket.error:
   1613             pass
   1614         # make a check value list from the registry entry: the value is a
   1615         # semicolon-separated list of glob patterns; the special '<local>'
   1616         # entry matches plain host names that contain no dot.
   1617         proxyOverride = proxyOverride.split(';')
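        # (a typical ProxyOverride value might look like
        # "10.*;*.internal.example;<local>" -- illustrative only)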
   1618         # now check if we match one of the registry values.
   1619         for test in proxyOverride:
   1620             if test == '<local>':
   1621                 if '.' not in rawHost:
   1622                     return 1
   1623             test = test.replace(".", r"\.")     # mask dots
   1624             test = test.replace("*", r".*")     # change glob sequence
   1625             test = test.replace("?", r".")      # change glob char
   1626             for val in host:
   1627                 # print "%s <--> %s" %( test, val )
   1628                 if re.match(test, val, re.I):
   1629                     return 1
   1630         return 0
   1631 
   1632     def proxy_bypass(host):
   1633         """Return True if the host should be bypassed.
   1634 
   1635         Checks proxy settings gathered from the environment, if specified,
   1636         or the registry.
   1637         """
   1638         proxies = getproxies_environment()
   1639         if proxies:
   1640             return proxy_bypass_environment(host, proxies)
   1641         else:
   1642             return proxy_bypass_registry(host)
   1643 
   1644 else:
   1645     # By default use environment variables
   1646     getproxies = getproxies_environment
   1647     proxy_bypass = proxy_bypass_environment
   1648 
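# Illustrative helper (assumed _demo_* name and placeholder host, not part of
# the module): whichever implementations were bound above for this platform,
# getproxies() and proxy_bypass() share the same interface.  Calling this
# queries the real environment / system configuration.
def _demo_proxy_interface():
    mapping = getproxies()              # e.g. {} or {'http': 'http://proxy:3128'}
    assert isinstance(mapping, dict)
    # a true result means 'host.example' should be reached without a proxy
    bypass = proxy_bypass('host.example')
    assert bypass in (0, 1)             # True/False compare equal to 1/0
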
   1649 # Test and time quote() and unquote()
   1650 def test1():
   1651     s = ''
   1652     for i in range(256): s = s + chr(i)
   1653     s = s*4
   1654     t0 = time.time()
   1655     qs = quote(s)
   1656     uqs = unquote(qs)
   1657     t1 = time.time()
   1658     if uqs != s:
   1659         print 'Wrong!'
   1660     print repr(s)
   1661     print repr(qs)
   1662     print repr(uqs)
   1663     print round(t1 - t0, 3), 'sec'
   1664 
   1665 
   1666 def reporthook(blocknum, blocksize, totalsize):
   1667     # Report during remote transfers
   1668     print "Block number: %d, Block size: %d, Total size: %d" % (
   1669         blocknum, blocksize, totalsize)
   1670
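# Illustrative usage (placeholder URL and filename, not from the original
# source): reporthook above is meant to be passed to urlretrieve(), which
# calls it as blocks arrive, e.g.
#
#   urlretrieve('http://www.example.com/', 'example.html', reporthook)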