      1 """An extensible library for opening URLs using a variety of protocols
      2 
      3 The simplest way to use this module is to call the urlopen function,
      4 which accepts a string containing a URL or a Request object (described
      5 below).  It opens the URL and returns the results as a file-like
      6 object; the returned object has some extra methods described below.
      7 
      8 The OpenerDirector manages a collection of Handler objects that do
      9 all the actual work.  Each Handler implements a particular protocol or
     10 option.  The OpenerDirector is a composite object that invokes the
     11 Handlers needed to open the requested URL.  For example, the
     12 HTTPHandler performs HTTP GET and POST requests and deals with
     13 non-error returns.  The HTTPRedirectHandler automatically deals with
     14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
     15 deals with digest authentication.
     16 
     17 urlopen(url, data=None) -- Basic usage is the same as the original
     18 urllib.  Pass the URL and, optionally, data to POST to an HTTP URL, and
     19 get a file-like object back.  One difference is that you can also pass
     20 a Request instance instead of a URL.  Raises a URLError (a subclass of
     21 IOError); for HTTP errors, raises an HTTPError, which can also be
     22 treated as a valid response.
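
For example (an illustrative sketch; the URL is a placeholder):

    import urllib2
    try:
        f = urllib2.urlopen('http://www.example.com/missing')
        print f.read()
    except urllib2.HTTPError, e:
        # HTTPError doubles as a response: it has a code, headers and a body
        print e.code, e.read()
    except urllib2.URLError, e:
        print e.reason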
     23 
     24 build_opener -- Function that creates a new OpenerDirector instance.
     25 Will install the default handlers.  Accepts one or more Handlers as
     26 arguments, either instances or Handler classes that it will
     27 instantiate.  If one of the arguments is a subclass of a default
     28 handler, that argument will be installed instead of the default.
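
For instance (a small sketch), passing an HTTPHandler instance created with
debuglevel=1 replaces the default HTTPHandler, so the HTTP exchange is
printed to stdout:

    opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=1))
    opener.open('http://www.example.com/')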
     29 
     30 install_opener -- Installs a new opener as the default opener.
     31 
     32 objects of interest:
     33 
     34 OpenerDirector -- Sets the User-Agent header to the Python-urllib client and
     35 manages the chain of Handler objects that process requests and responses.
     36 
     37 Request -- An object that encapsulates the state of a request.  The
     38 state can be as simple as the URL.  It can also include extra HTTP
     39 headers, e.g. a User-Agent.
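
A minimal sketch (placeholder URL and values):

    req = urllib2.Request('http://www.example.com/form',
                          data='answer=42',
                          headers={'User-Agent': 'example-client/1.0'})
    f = urllib2.urlopen(req)   # sent as a POST because data is present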
     40 
     41 BaseHandler -- Parent class of all Handlers; subclasses supply the
     protocol-specific *_open(), *_request(), *_response() and *_error_*()
     methods that the OpenerDirector dispatches to.
     42 
     43 exceptions:
     44 URLError -- A subclass of IOError; individual protocols have their own
     45 specific subclasses.
     46 
     47 HTTPError -- Also a valid HTTP response, so you can treat an HTTP error
     48 either as an exceptional event or as a valid response.
     49 
     50 internals:
     51 BaseHandler and parent
     52 _call_chain conventions
     53 
     54 Example usage:
     55 
     56 import urllib2
     57 
     58 # set up authentication info
     59 authinfo = urllib2.HTTPBasicAuthHandler()
     60 authinfo.add_password(realm='PDQ Application',
     61                       uri='https://mahler:8092/site-updates.py',
     62                       user='klem',
     63                       passwd='geheim$parole')
     64 
     65 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
     66 
     67 # build a new opener that adds authentication and caching FTP handlers
     68 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
     69 
     70 # install it
     71 urllib2.install_opener(opener)
     72 
     73 f = urllib2.urlopen('http://www.python.org/')
     74 
     75 
     76 """
     77 
     78 # XXX issues:
     79 # If an authentication error handler tries to perform authentication
     80 # but fails, how should the error be signalled?  The client needs to
     81 # know the HTTP error code.  But if the handler knows what the problem
     82 # was, e.g., that it didn't know the hash algorithm requested in the
     83 # challenge, it would be good to pass that information along to the
     84 # client, too.
     85 # ftp errors aren't handled cleanly
     86 # check digest against correct (i.e. non-apache) implementation
     87 
     88 # Possible extensions:
     89 # complex proxies  XXX not sure what exactly was meant by this
     90 # abstract factory for opener
     91 
     92 import base64
     93 import hashlib
     94 import httplib
     95 import mimetools
     96 import os
     97 import posixpath
     98 import random
     99 import re
    100 import socket
    101 import sys
    102 import time
    103 import urlparse
    104 import bisect
    105 
    106 try:
    107     from cStringIO import StringIO
    108 except ImportError:
    109     from StringIO import StringIO
    110 
    111 from urllib import (unwrap, unquote, splittype, splithost, quote,
    112      addinfourl, splitport, splittag,
    113      splitattr, ftpwrapper, splituser, splitpasswd, splitvalue)
    114 
    115 # support for FileHandler, proxies via environment variables
    116 from urllib import localhost, url2pathname, getproxies, proxy_bypass
    117 
    118 # used in User-Agent header sent
    119 __version__ = sys.version[:3]
    120 
    121 _opener = None
    122 def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    123     global _opener
    124     if _opener is None:
    125         _opener = build_opener()
    126     return _opener.open(url, data, timeout)
    127 
    128 def install_opener(opener):
    129     global _opener
    130     _opener = opener
    131 
    132 # do these error classes make sense?
    133 # make sure all of the IOError stuff is overridden.  we just want to be
    134 # subtypes.
    135 
    136 class URLError(IOError):
    137     # URLError is a sub-type of IOError, but it doesn't share any of
    138     # the implementation.  need to override __init__ and __str__.
    139     # It sets self.args for compatibility with other EnvironmentError
    140     # subclasses, but args doesn't have the typical format with errno in
    141     # slot 0 and strerror in slot 1.  This may be better than nothing.
    142     def __init__(self, reason):
    143         self.args = reason,
    144         self.reason = reason
    145 
    146     def __str__(self):
    147         return '<urlopen error %s>' % self.reason
    148 
    149 class HTTPError(URLError, addinfourl):
    150     """Raised when HTTP error occurs, but also acts like non-error return"""
    151     __super_init = addinfourl.__init__
    152 
    153     def __init__(self, url, code, msg, hdrs, fp):
    154         self.code = code
    155         self.msg = msg
    156         self.hdrs = hdrs
    157         self.fp = fp
    158         self.filename = url
    159         # The addinfourl classes depend on fp being a valid file
    160         # object.  In some cases, the HTTPError may not have a valid
    161         # file object.  If this happens, the simplest workaround is to
    162         # not initialize the base classes.
    163         if fp is not None:
    164             self.__super_init(fp, hdrs, url, code)
    165 
    166     def __str__(self):
    167         return 'HTTP Error %s: %s' % (self.code, self.msg)
    168 
    169 # copied from cookielib.py
    170 _cut_port_re = re.compile(r":\d+$")
    171 def request_host(request):
    172     """Return request-host, as defined by RFC 2965.
    173 
    174     Variation from RFC: returned value is lowercased, for convenient
    175     comparison.
    176 
    177     """
    178     url = request.get_full_url()
    179     host = urlparse.urlparse(url)[1]
    180     if host == "":
    181         host = request.get_header("Host", "")
    182 
    183     # remove port, if present
    184     host = _cut_port_re.sub("", host, 1)
    185     return host.lower()
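# A hedged illustration of request_host() (placeholder host); the port is
# stripped and the name lowercased:
#
#     >>> request_host(Request("http://WWW.Example.com:8080/index.html"))
#     'www.example.com'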
    186 
    187 class Request:
    188 
    189     def __init__(self, url, data=None, headers={},
    190                  origin_req_host=None, unverifiable=False):
    191         # unwrap('<URL:type://host/path>') --> 'type://host/path'
    192         self.__original = unwrap(url)
    193         self.__original, self.__fragment = splittag(self.__original)
    194         self.type = None
    195         # self.__r_type is what's left after doing the splittype
    196         self.host = None
    197         self.port = None
    198         self._tunnel_host = None
    199         self.data = data
    200         self.headers = {}
    201         for key, value in headers.items():
    202             self.add_header(key, value)
    203         self.unredirected_hdrs = {}
    204         if origin_req_host is None:
    205             origin_req_host = request_host(self)
    206         self.origin_req_host = origin_req_host
    207         self.unverifiable = unverifiable
    208 
    209     def __getattr__(self, attr):
    210         # XXX this is a fallback mechanism to guard against these
    211         # methods getting called in a non-standard order.  this may be
    212         # too complicated and/or unnecessary.
    213         # XXX should the __r_XXX attributes be public?
    214         if attr[:12] == '_Request__r_':
    215             name = attr[12:]
    216             if hasattr(Request, 'get_' + name):
    217                 getattr(self, 'get_' + name)()
    218                 return getattr(self, attr)
    219         raise AttributeError, attr
    220 
    221     def get_method(self):
    222         if self.has_data():
    223             return "POST"
    224         else:
    225             return "GET"
    226 
    227     # XXX these helper methods are lame
    228 
    229     def add_data(self, data):
    230         self.data = data
    231 
    232     def has_data(self):
    233         return self.data is not None
    234 
    235     def get_data(self):
    236         return self.data
    237 
    238     def get_full_url(self):
    239         if self.__fragment:
    240             return '%s#%s' % (self.__original, self.__fragment)
    241         else:
    242             return self.__original
    243 
    244     def get_type(self):
    245         if self.type is None:
    246             self.type, self.__r_type = splittype(self.__original)
    247             if self.type is None:
    248                 raise ValueError, "unknown url type: %s" % self.__original
    249         return self.type
    250 
    251     def get_host(self):
    252         if self.host is None:
    253             self.host, self.__r_host = splithost(self.__r_type)
    254             if self.host:
    255                 self.host = unquote(self.host)
    256         return self.host
    257 
    258     def get_selector(self):
    259         return self.__r_host
    260 
    261     def set_proxy(self, host, type):
    262         if self.type == 'https' and not self._tunnel_host:
    263             self._tunnel_host = self.host
    264         else:
    265             self.type = type
    266             self.__r_host = self.__original
    267 
    268         self.host = host
    269 
    270     def has_proxy(self):
    271         return self.__r_host == self.__original
    272 
    273     def get_origin_req_host(self):
    274         return self.origin_req_host
    275 
    276     def is_unverifiable(self):
    277         return self.unverifiable
    278 
    279     def add_header(self, key, val):
    280         # useful for something like authentication
    281         self.headers[key.capitalize()] = val
    282 
    283     def add_unredirected_header(self, key, val):
    284         # will not be added to a redirected request
    285         self.unredirected_hdrs[key.capitalize()] = val
    286 
    287     def has_header(self, header_name):
    288         return (header_name in self.headers or
    289                 header_name in self.unredirected_hdrs)
    290 
    291     def get_header(self, header_name, default=None):
    292         return self.headers.get(
    293             header_name,
    294             self.unredirected_hdrs.get(header_name, default))
    295 
    296     def header_items(self):
    297         hdrs = self.unredirected_hdrs.copy()
    298         hdrs.update(self.headers)
    299         return hdrs.items()
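    # A few illustrative notes on Request behaviour, sketched as doctest-style
    # comments with placeholder URLs:
    #
    #     >>> r = Request('http://www.example.com/cgi', data='x=1')
    #     >>> r.get_method()
    #     'POST'
    #     >>> r.add_header('USER-AGENT', 'example/1.0')
    #     >>> r.has_header('User-agent')   # keys are stored str.capitalize()d
    #     True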
    300 
    301 class OpenerDirector:
    302     def __init__(self):
    303         client_version = "Python-urllib/%s" % __version__
    304         self.addheaders = [('User-agent', client_version)]
    305         # self.handlers is retained only for backward compatibility
    306         self.handlers = []
    307         # manage the individual handlers
    308         self.handle_open = {}
    309         self.handle_error = {}
    310         self.process_response = {}
    311         self.process_request = {}
    312 
    313     def add_handler(self, handler):
    314         if not hasattr(handler, "add_parent"):
    315             raise TypeError("expected BaseHandler instance, got %r" %
    316                             type(handler))
    317 
    318         added = False
    319         for meth in dir(handler):
    320             if meth in ["redirect_request", "do_open", "proxy_open"]:
    321                 # oops, coincidental match
    322                 continue
    323 
    324             i = meth.find("_")
    325             protocol = meth[:i]
    326             condition = meth[i+1:]
    327 
    328             if condition.startswith("error"):
    329                 j = condition.find("_") + i + 1
    330                 kind = meth[j+1:]
    331                 try:
    332                     kind = int(kind)
    333                 except ValueError:
    334                     pass
    335                 lookup = self.handle_error.get(protocol, {})
    336                 self.handle_error[protocol] = lookup
    337             elif condition == "open":
    338                 kind = protocol
    339                 lookup = self.handle_open
    340             elif condition == "response":
    341                 kind = protocol
    342                 lookup = self.process_response
    343             elif condition == "request":
    344                 kind = protocol
    345                 lookup = self.process_request
    346             else:
    347                 continue
    348 
    349             handlers = lookup.setdefault(kind, [])
    350             if handlers:
    351                 bisect.insort(handlers, handler)
    352             else:
    353                 handlers.append(handler)
    354             added = True
    355 
    356         if added:
    357             bisect.insort(self.handlers, handler)
    358             handler.add_parent(self)
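    # An illustrative sketch of the method-name convention that add_handler()
    # keys on (mappings follow from the parsing logic above):
    #
    #     http_open       -> self.handle_open['http']
    #     default_open    -> self.handle_open['default']
    #     unknown_open    -> self.handle_open['unknown']
    #     http_error_302  -> self.handle_error['http'][302]
    #     http_request    -> self.process_request['http']
    #     http_response   -> self.process_response['http']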
    359 
    360     def close(self):
    361         # Only exists for backwards compatibility.
    362         pass
    363 
    364     def _call_chain(self, chain, kind, meth_name, *args):
    365         # Handlers raise an exception if no one else should try to handle
    366         # the request, or return None if they can't but another handler
    367         # could.  Otherwise, they return the response.
    368         handlers = chain.get(kind, ())
    369         for handler in handlers:
    370             func = getattr(handler, meth_name)
    371 
    372             result = func(*args)
    373             if result is not None:
    374                 return result
    375 
    376     def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    377         # accept a URL or a Request object
    378         if isinstance(fullurl, basestring):
    379             req = Request(fullurl, data)
    380         else:
    381             req = fullurl
    382             if data is not None:
    383                 req.add_data(data)
    384 
    385         req.timeout = timeout
    386         protocol = req.get_type()
    387 
    388         # pre-process request
    389         meth_name = protocol+"_request"
    390         for processor in self.process_request.get(protocol, []):
    391             meth = getattr(processor, meth_name)
    392             req = meth(req)
    393 
    394         response = self._open(req, data)
    395 
    396         # post-process response
    397         meth_name = protocol+"_response"
    398         for processor in self.process_response.get(protocol, []):
    399             meth = getattr(processor, meth_name)
    400             response = meth(req, response)
    401 
    402         return response
    403 
    404     def _open(self, req, data=None):
    405         result = self._call_chain(self.handle_open, 'default',
    406                                   'default_open', req)
    407         if result:
    408             return result
    409 
    410         protocol = req.get_type()
    411         result = self._call_chain(self.handle_open, protocol, protocol +
    412                                   '_open', req)
    413         if result:
    414             return result
    415 
    416         return self._call_chain(self.handle_open, 'unknown',
    417                                 'unknown_open', req)
    418 
    419     def error(self, proto, *args):
    420         if proto in ('http', 'https'):
    421             # XXX http[s] protocols are special-cased
    422             dict = self.handle_error['http'] # https is not different from http
    423             proto = args[2]  # YUCK!
    424             meth_name = 'http_error_%s' % proto
    425             http_err = 1
    426             orig_args = args
    427         else:
    428             dict = self.handle_error
    429             meth_name = proto + '_error'
    430             http_err = 0
    431         args = (dict, proto, meth_name) + args
    432         result = self._call_chain(*args)
    433         if result:
    434             return result
    435 
    436         if http_err:
    437             args = (dict, 'default', 'http_error_default') + orig_args
    438             return self._call_chain(*args)
    439 
    440 # XXX probably also want an abstract factory that knows when it makes
    441 # sense to skip a superclass in favor of a subclass and when it might
    442 # make sense to include both
    443 
    444 def build_opener(*handlers):
    445     """Create an opener object from a list of handlers.
    446 
    447     The opener will use several default handlers, including support
    448     for HTTP, FTP and when applicable, HTTPS.
    449 
    450     If any of the handlers passed as arguments are subclasses of the
    451     default handlers, the default handlers will not be used.
    452     """
    453     import types
    454     def isclass(obj):
    455         return isinstance(obj, (types.ClassType, type))
    456 
    457     opener = OpenerDirector()
    458     default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
    459                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
    460                        FTPHandler, FileHandler, HTTPErrorProcessor]
    461     if hasattr(httplib, 'HTTPS'):
    462         default_classes.append(HTTPSHandler)
    463     skip = set()
    464     for klass in default_classes:
    465         for check in handlers:
    466             if isclass(check):
    467                 if issubclass(check, klass):
    468                     skip.add(klass)
    469             elif isinstance(check, klass):
    470                 skip.add(klass)
    471     for klass in skip:
    472         default_classes.remove(klass)
    473 
    474     for klass in default_classes:
    475         opener.add_handler(klass())
    476 
    477     for h in handlers:
    478         if isclass(h):
    479             h = h()
    480         opener.add_handler(h)
    481     return opener
    482 
    483 class BaseHandler:
    484     handler_order = 500
    485 
    486     def add_parent(self, parent):
    487         self.parent = parent
    488 
    489     def close(self):
    490         # Only exists for backwards compatibility

    491         pass
    492 
    493     def __lt__(self, other):
    494         if not hasattr(other, "handler_order"):
    495             # Try to preserve the old behavior of having custom classes
    496             # inserted after default ones (works only for custom user
    497             # classes which are not aware of handler_order).
    498             return True
    499         return self.handler_order < other.handler_order
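    # A minimal custom-handler sketch (the class and header names are made up
    # for illustration).  Deriving from BaseHandler supplies add_parent() and
    # handler_order, which is all add_handler() requires; an http_request()
    # processor must return the (possibly modified) request:
    #
    #     class StampRequestHandler(BaseHandler):
    #         def http_request(self, req):
    #             req.add_unredirected_header('X-Example-Stamp', '1')
    #             return req
    #
    #     opener = build_opener(StampRequestHandler)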
    500 
    501 
    502 class HTTPErrorProcessor(BaseHandler):
    503     """Process HTTP error responses."""
    504     handler_order = 1000  # after all other processing
    505 
    506     def http_response(self, request, response):
    507         code, msg, hdrs = response.code, response.msg, response.info()
    508 
    509         # According to RFC 2616, "2xx" code indicates that the client's
    510         # request was successfully received, understood, and accepted.
    511         if not (200 <= code < 300):
    512             response = self.parent.error(
    513                 'http', request, response, code, msg, hdrs)
    514 
    515         return response
    516 
    517     https_response = http_response
    518 
    519 class HTTPDefaultErrorHandler(BaseHandler):
    520     def http_error_default(self, req, fp, code, msg, hdrs):
    521         raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
    522 
    523 class HTTPRedirectHandler(BaseHandler):
    524     # maximum number of redirections to any single URL
    525     # this is needed because of the state that cookies introduce
    526     max_repeats = 4
    527     # maximum total number of redirections (regardless of URL) before
    528     # assuming we're in a loop
    529     max_redirections = 10
    530 
    531     def redirect_request(self, req, fp, code, msg, headers, newurl):
    532         """Return a Request or None in response to a redirect.
    533 
    534         This is called by the http_error_30x methods when a
    535         redirection response is received.  If a redirection should
    536         take place, return a new Request to allow http_error_30x to
    537         perform the redirect.  Otherwise, raise HTTPError if no-one
    538         else should try to handle this url.  Return None if you can't
    539         but another Handler might.
    540         """
    541         m = req.get_method()
    542         if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
    543             or code in (301, 302, 303) and m == "POST"):
    544             # Strictly (according to RFC 2616), 301 or 302 in response
    545             # to a POST MUST NOT cause a redirection without confirmation
    546             # from the user (of urllib2, in this case).  In practice,
    547             # essentially all clients do redirect in this case, so we
    548             # do the same.
    549             # be lenient with URIs containing a space
    550             newurl = newurl.replace(' ', '%20')
    551             newheaders = dict((k,v) for k,v in req.headers.items()
    552                               if k.lower() not in ("content-length", "content-type")
    553                              )
    554             return Request(newurl,
    555                            headers=newheaders,
    556                            origin_req_host=req.get_origin_req_host(),
    557                            unverifiable=True)
    558         else:
    559             raise HTTPError(req.get_full_url(), code, msg, headers, fp)
    560 
    561     # Implementation note: To avoid the server sending us into an
    562     # infinite loop, the request object needs to track what URLs we
    563     # have already seen.  Do this by adding a handler-specific
    564     # attribute to the Request object.
    565     def http_error_302(self, req, fp, code, msg, headers):
    566         # Some servers (incorrectly) return multiple Location headers
    567         # (so probably same goes for URI).  Use first header.
    568         if 'location' in headers:
    569             newurl = headers.getheaders('location')[0]
    570         elif 'uri' in headers:
    571             newurl = headers.getheaders('uri')[0]
    572         else:
    573             return
    574 
    575         # fix a possible malformed URL
    576         urlparts = urlparse.urlparse(newurl)
    577         if not urlparts.path:
    578             urlparts = list(urlparts)
    579             urlparts[2] = "/"
    580         newurl = urlparse.urlunparse(urlparts)
    581 
    582         newurl = urlparse.urljoin(req.get_full_url(), newurl)
    583 
    584         # For security reasons we do not allow redirects to protocols
    585         # other than HTTP, HTTPS or FTP.
    586         newurl_lower = newurl.lower()
    587         if not (newurl_lower.startswith('http://') or
    588                 newurl_lower.startswith('https://') or
    589                 newurl_lower.startswith('ftp://')):
    590             raise HTTPError(newurl, code,
    591                             msg + " - Redirection to url '%s' is not allowed" %
    592                             newurl,
    593                             headers, fp)
    594 
    595         # XXX Probably want to forget about the state of the current
    596         # request, although that might interact poorly with other
    597         # handlers that also use handler-specific request attributes
    598         new = self.redirect_request(req, fp, code, msg, headers, newurl)
    599         if new is None:
    600             return
    601 
    602         # loop detection
    603         # .redirect_dict has a key url if url was previously visited.
    604         if hasattr(req, 'redirect_dict'):
    605             visited = new.redirect_dict = req.redirect_dict
    606             if (visited.get(newurl, 0) >= self.max_repeats or
    607                 len(visited) >= self.max_redirections):
    608                 raise HTTPError(req.get_full_url(), code,
    609                                 self.inf_msg + msg, headers, fp)
    610         else:
    611             visited = new.redirect_dict = req.redirect_dict = {}
    612         visited[newurl] = visited.get(newurl, 0) + 1
    613 
    614         # Don't close the fp until we are sure that we won't use it
    615         # with HTTPError.
    616         fp.read()
    617         fp.close()
    618 
    619         return self.parent.open(new, timeout=req.timeout)
    620 
    621     http_error_301 = http_error_303 = http_error_307 = http_error_302
    622 
    623     inf_msg = "The HTTP server returned a redirect error that would " \
    624               "lead to an infinite loop.\n" \
    625               "The last 30x error message was:\n"
    626 
    627 
    628 def _parse_proxy(proxy):
    629     """Return (scheme, user, password, host/port) given a URL or an authority.
    630 
    631     If a URL is supplied, it must have an authority (host:port) component.
    632     According to RFC 3986, having an authority component means the URL must
    633     have two slashes after the scheme:
    634 
    635     >>> _parse_proxy('file:/ftp.example.com/')
    636     Traceback (most recent call last):
    637     ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
    638 
    639     The first three items of the returned tuple may be None.
    640 
    641     Examples of authority parsing:
    642 
    643     >>> _parse_proxy('proxy.example.com')
    644     (None, None, None, 'proxy.example.com')
    645     >>> _parse_proxy('proxy.example.com:3128')
    646     (None, None, None, 'proxy.example.com:3128')
    647 
    648     The authority component may optionally include userinfo (assumed to be
    649     username:password):
    650 
    651     >>> _parse_proxy('joe:password@proxy.example.com')
    652     (None, 'joe', 'password', 'proxy.example.com')
    653     >>> _parse_proxy('joe:password@proxy.example.com:3128')
    654     (None, 'joe', 'password', 'proxy.example.com:3128')
    655 
    656     Same examples, but with URLs instead:
    657 
    658     >>> _parse_proxy('http://proxy.example.com/')
    659     ('http', None, None, 'proxy.example.com')
    660     >>> _parse_proxy('http://proxy.example.com:3128/')
    661     ('http', None, None, 'proxy.example.com:3128')
    662     >>> _parse_proxy('http://joe:password@proxy.example.com/')
    663     ('http', 'joe', 'password', 'proxy.example.com')
    664     >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    665     ('http', 'joe', 'password', 'proxy.example.com:3128')
    666 
    667     Everything after the authority is ignored:
    668 
    669     >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    670     ('ftp', 'joe', 'password', 'proxy.example.com')
    671 
    672     Test for no trailing '/' case:
    673 
    674     >>> _parse_proxy('http://joe:password@proxy.example.com')
    675     ('http', 'joe', 'password', 'proxy.example.com')
    676 
    677     """
    678     scheme, r_scheme = splittype(proxy)
    679     if not r_scheme.startswith("/"):
    680         # authority
    681         scheme = None
    682         authority = proxy
    683     else:
    684         # URL
    685         if not r_scheme.startswith("//"):
    686             raise ValueError("proxy URL with no authority: %r" % proxy)
    687         # We have an authority, so for RFC 3986-compliant URLs (by ss 3.2
    688         # and 3.3), path is empty or starts with '/'
    689         end = r_scheme.find("/", 2)
    690         if end == -1:
    691             end = None
    692         authority = r_scheme[2:end]
    693     userinfo, hostport = splituser(authority)
    694     if userinfo is not None:
    695         user, password = splitpasswd(userinfo)
    696     else:
    697         user = password = None
    698     return scheme, user, password, hostport
    699 
    700 class ProxyHandler(BaseHandler):
    701     # Proxies must be in front
    702     handler_order = 100
    703 
    704     def __init__(self, proxies=None):
    705         if proxies is None:
    706             proxies = getproxies()
    707         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
    708         self.proxies = proxies
    709         for type, url in proxies.items():
    710             setattr(self, '%s_open' % type,
    711                     lambda r, proxy=url, type=type, meth=self.proxy_open: \
    712                     meth(r, proxy, type))
    713 
    714     def proxy_open(self, req, proxy, type):
    715         orig_type = req.get_type()
    716         proxy_type, user, password, hostport = _parse_proxy(proxy)
    717 
    718         if proxy_type is None:
    719             proxy_type = orig_type
    720 
    721         if req.host and proxy_bypass(req.host):
    722             return None
    723 
    724         if user and password:
    725             user_pass = '%s:%s' % (unquote(user), unquote(password))
    726             creds = base64.b64encode(user_pass).strip()
    727             req.add_header('Proxy-authorization', 'Basic ' + creds)
    728         hostport = unquote(hostport)
    729         req.set_proxy(hostport, proxy_type)
    730 
    731         if orig_type == proxy_type or orig_type == 'https':
    732             # let other handlers take care of it
    733             return None
    734         else:
    735             # need to start over, because the other handlers don't
    736             # grok the proxy's URL type
    737             # e.g. if we have a constructor arg proxies like so:
    738             # {'http': 'ftp://proxy.example.com'}, we may end up turning
    739             # a request for http://acme.example.com/a into one for
    740             # ftp://proxy.example.com/a
    741             return self.parent.open(req, timeout=req.timeout)
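    # A usage sketch with credentials embedded in the proxy URL (all values
    # are placeholders); proxy_open() above decodes the userinfo part and
    # adds a Proxy-authorization header before rewriting the request:
    #
    #     handler = ProxyHandler({'http': 'http://joe:secret@proxy.example.com:3128/'})
    #     opener = build_opener(handler)
    #     opener.open('http://www.example.com/')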
    742 
    743 class HTTPPasswordMgr:
    744 
    745     def __init__(self):
    746         self.passwd = {}
    747 
    748     def add_password(self, realm, uri, user, passwd):
    749         # uri could be a single URI or a sequence
    750         if isinstance(uri, basestring):
    751             uri = [uri]
    752         if not realm in self.passwd:
    753             self.passwd[realm] = {}
    754         for default_port in True, False:
    755             reduced_uri = tuple(
    756                 [self.reduce_uri(u, default_port) for u in uri])
    757             self.passwd[realm][reduced_uri] = (user, passwd)
    758 
    759     def find_user_password(self, realm, authuri):
    760         domains = self.passwd.get(realm, {})
    761         for default_port in True, False:
    762             reduced_authuri = self.reduce_uri(authuri, default_port)
    763             for uris, authinfo in domains.iteritems():
    764                 for uri in uris:
    765                     if self.is_suburi(uri, reduced_authuri):
    766                         return authinfo
    767         return None, None
    768 
    769     def reduce_uri(self, uri, default_port=True):
    770         """Accept authority or URI and extract only the authority and path."""
    771         # note HTTP URLs do not have a userinfo component
    772         parts = urlparse.urlsplit(uri)
    773         if parts[1]:
    774             # URI
    775             scheme = parts[0]
    776             authority = parts[1]
    777             path = parts[2] or '/'
    778         else:
    779             # host or host:port
    780             scheme = None
    781             authority = uri
    782             path = '/'
    783         host, port = splitport(authority)
    784         if default_port and port is None and scheme is not None:
    785             dport = {"http": 80,
    786                      "https": 443,
    787                      }.get(scheme)
    788             if dport is not None:
    789                 authority = "%s:%d" % (host, dport)
    790         return authority, path
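    # Illustrative results (placeholder hosts), showing why add_password()
    # stores both the default-port and the as-given reductions:
    #
    #     reduce_uri('http://example.com/spam/eggs', default_port=True)
    #         -> ('example.com:80', '/spam/eggs')
    #     reduce_uri('example.com', default_port=True)
    #         -> ('example.com', '/')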
    791 
    792     def is_suburi(self, base, test):
    793         """Check if test is below base in a URI tree
    794 
    795         Both args must be URIs in reduced form.
    796         """
    797         if base == test:
    798             return True
    799         if base[0] != test[0]:
    800             return False
    801         common = posixpath.commonprefix((base[1], test[1]))
    802         if len(common) == len(base[1]):
    803             return True
    804         return False
    805 
    806 
    807 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    808 
    809     def find_user_password(self, realm, authuri):
    810         user, password = HTTPPasswordMgr.find_user_password(self, realm,
    811                                                             authuri)
    812         if user is not None:
    813             return user, password
    814         return HTTPPasswordMgr.find_user_password(self, None, authuri)
    815 
    816 
    817 class AbstractBasicAuthHandler:
    818 
    819     # XXX this allows for multiple auth-schemes, but will stupidly pick
    820     # the last one with a realm specified.
    821 
    822     # allow for double- and single-quoted realm values
    823     # (single quotes are a violation of the RFC, but appear in the wild)
    824     rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
    825                     'realm=(["\'])(.*?)\\2', re.I)
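    # For example (a sketch), matching the challenge 'Basic realm="example"'
    # yields the groups ('Basic', '"', 'example'), which
    # http_error_auth_reqed() below unpacks as (scheme, quote, realm).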
    826 
    827     # XXX could pre-emptively send auth info already accepted (RFC 2617,
    828     # end of section 2, and section 1.2 immediately after "credentials"
    829     # production).
    830 
    831     def __init__(self, password_mgr=None):
    832         if password_mgr is None:
    833             password_mgr = HTTPPasswordMgr()
    834         self.passwd = password_mgr
    835         self.add_password = self.passwd.add_password
    836         self.retried = 0
    837 
    838     def reset_retry_count(self):
    839         self.retried = 0
    840 
    841     def http_error_auth_reqed(self, authreq, host, req, headers):
    842         # host may be an authority (without userinfo) or a URL with an
    843         # authority
    844         # XXX could be multiple headers
    845         authreq = headers.get(authreq, None)
    846 
    847         if self.retried > 5:
    848             # retry sending the username:password 5 times before failing.
    849             raise HTTPError(req.get_full_url(), 401, "basic auth failed",
    850                             headers, None)
    851         else:
    852             self.retried += 1
    853 
    854         if authreq:
    855             mo = AbstractBasicAuthHandler.rx.search(authreq)
    856             if mo:
    857                 scheme, quote, realm = mo.groups()
    858                 if scheme.lower() == 'basic':
    859                     response = self.retry_http_basic_auth(host, req, realm)
    860                     if response and response.code != 401:
    861                         self.retried = 0
    862                     return response
    863 
    864     def retry_http_basic_auth(self, host, req, realm):
    865         user, pw = self.passwd.find_user_password(realm, host)
    866         if pw is not None:
    867             raw = "%s:%s" % (user, pw)
    868             auth = 'Basic %s' % base64.b64encode(raw).strip()
    869             if req.headers.get(self.auth_header, None) == auth:
    870                 return None
    871             req.add_unredirected_header(self.auth_header, auth)
    872             return self.parent.open(req, timeout=req.timeout)
    873         else:
    874             return None
    875 
    876 
    877 class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    878 
    879     auth_header = 'Authorization'
    880 
    881     def http_error_401(self, req, fp, code, msg, headers):
    882         url = req.get_full_url()
    883         response = self.http_error_auth_reqed('www-authenticate',
    884                                               url, req, headers)
    885         self.reset_retry_count()
    886         return response
    887 
    888 
    889 class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    890 
    891     auth_header = 'Proxy-authorization'
    892 
    893     def http_error_407(self, req, fp, code, msg, headers):
    894         # http_error_auth_reqed requires that there is no userinfo component in
    895         # authority.  Assume there isn't one, since urllib2 does not (and
    896         # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
    897         # userinfo.
    898         authority = req.get_host()
    899         response = self.http_error_auth_reqed('proxy-authenticate',
    900                                           authority, req, headers)
    901         self.reset_retry_count()
    902         return response
    903 
    904 
    905 def randombytes(n):
    906     """Return n random bytes."""
    907     # Use /dev/urandom if it is available.  Fall back to random module
    908     # if not.  It might be worthwhile to extend this function to use
    909     # other platform-specific mechanisms for getting random bytes.
    910     if os.path.exists("/dev/urandom"):
    911         f = open("/dev/urandom")
    912         s = f.read(n)
    913         f.close()
    914         return s
    915     else:
    916         L = [chr(random.randrange(0, 256)) for i in range(n)]
    917         return "".join(L)
    918 
    919 class AbstractDigestAuthHandler:
    920     # Digest authentication is specified in RFC 2617.
    921 
    922     # XXX The client does not inspect the Authentication-Info header
    923     # in a successful response.
    924 
    925     # XXX It should be possible to test this implementation against
    926     # a mock server that just generates a static set of challenges.
    927 
    928     # XXX qop="auth-int" support is shaky
    929 
    930     def __init__(self, passwd=None):
    931         if passwd is None:
    932             passwd = HTTPPasswordMgr()
    933         self.passwd = passwd
    934         self.add_password = self.passwd.add_password
    935         self.retried = 0
    936         self.nonce_count = 0
    937         self.last_nonce = None
    938 
    939     def reset_retry_count(self):
    940         self.retried = 0
    941 
    942     def http_error_auth_reqed(self, auth_header, host, req, headers):
    943         authreq = headers.get(auth_header, None)
    944         if self.retried > 5:
    945             # Don't fail endlessly - if we failed once, we'll probably
    946             # fail a second time. Hm. Unless the Password Manager is
    947             # prompting for the information. Crap. This isn't great
    948             # but it's better than the current 'repeat until recursion
    949             # depth exceeded' approach <wink>
    950             raise HTTPError(req.get_full_url(), 401, "digest auth failed",
    951                             headers, None)
    952         else:
    953             self.retried += 1
    954         if authreq:
    955             scheme = authreq.split()[0]
    956             if scheme.lower() == 'digest':
    957                 return self.retry_http_digest_auth(req, authreq)
    958 
    959     def retry_http_digest_auth(self, req, auth):
    960         token, challenge = auth.split(' ', 1)
    961         chal = parse_keqv_list(parse_http_list(challenge))
    962         auth = self.get_authorization(req, chal)
    963         if auth:
    964             auth_val = 'Digest %s' % auth
    965             if req.headers.get(self.auth_header, None) == auth_val:
    966                 return None
    967             req.add_unredirected_header(self.auth_header, auth_val)
    968             resp = self.parent.open(req, timeout=req.timeout)
    969             return resp
    970 
    971     def get_cnonce(self, nonce):
    972         # The cnonce-value is an opaque
    973         # quoted string value provided by the client and used by both client
    974         # and server to avoid chosen plaintext attacks, to provide mutual
    975         # authentication, and to provide some message integrity protection.
    976         # This isn't a fabulous effort, but it's probably Good Enough.
    977         dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
    978                                             randombytes(8))).hexdigest()
    979         return dig[:16]
    980 
    981     def get_authorization(self, req, chal):
    982         try:
    983             realm = chal['realm']
    984             nonce = chal['nonce']
    985             qop = chal.get('qop')
    986             algorithm = chal.get('algorithm', 'MD5')
    987             # mod_digest doesn't send an opaque, even though it isn't
    988             # supposed to be optional
    989             opaque = chal.get('opaque', None)
    990         except KeyError:
    991             return None
    992 
    993         H, KD = self.get_algorithm_impls(algorithm)
    994         if H is None:
    995             return None
    996 
    997         user, pw = self.passwd.find_user_password(realm, req.get_full_url())
    998         if user is None:
    999             return None
   1000 
   1001         # XXX not implemented yet
   1002         if req.has_data():
   1003             entdig = self.get_entity_digest(req.get_data(), chal)
   1004         else:
   1005             entdig = None
   1006 
   1007         A1 = "%s:%s:%s" % (user, realm, pw)
   1008         A2 = "%s:%s" % (req.get_method(),
   1009                         # XXX selector: what about proxies and full urls
   1010                         req.get_selector())
   1011         if qop == 'auth':
   1012             if nonce == self.last_nonce:
   1013                 self.nonce_count += 1
   1014             else:
   1015                 self.nonce_count = 1
   1016                 self.last_nonce = nonce
   1017 
   1018             ncvalue = '%08x' % self.nonce_count
   1019             cnonce = self.get_cnonce(nonce)
   1020             noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
   1021             respdig = KD(H(A1), noncebit)
   1022         elif qop is None:
   1023             respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
   1024         else:
   1025             # XXX handle auth-int.
   1026             raise URLError("qop '%s' is not supported." % qop)
   1027 
   1028         # XXX should the partial digests be encoded too?
   1029 
   1030         base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
   1031                'response="%s"' % (user, realm, nonce, req.get_selector(),
   1032                                   respdig)
   1033         if opaque:
   1034             base += ', opaque="%s"' % opaque
   1035         if entdig:
   1036             base += ', digest="%s"' % entdig
   1037         base += ', algorithm="%s"' % algorithm
   1038         if qop:
   1039             base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
   1040         return base
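    # A compact restatement of the RFC 2617 computation performed above, in
    # terms of the local names (qop == 'auth' case):
    #
    #     A1      = user ":" realm ":" password
    #     A2      = method ":" selector
    #     respdig = KD(H(A1), nonce ":" nc ":" cnonce ":" qop ":" H(A2))
    #
    # and, when the server offered no qop, respdig = KD(H(A1), nonce ":" H(A2)).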
   1041 
   1042     def get_algorithm_impls(self, algorithm):
   1043         # algorithm should be case-insensitive according to RFC2617
   1044         algorithm = algorithm.upper()
   1045         # lambdas assume digest modules are imported at the top level
   1046         if algorithm == 'MD5':
   1047             H = lambda x: hashlib.md5(x).hexdigest()
   1048         elif algorithm == 'SHA':
   1049             H = lambda x: hashlib.sha1(x).hexdigest()
   1050         # XXX MD5-sess
   1051         KD = lambda s, d: H("%s:%s" % (s, d))
   1052         return H, KD
   1053 
   1054     def get_entity_digest(self, data, chal):
   1055         # XXX not implemented yet
   1056         return None
   1057 
   1058 
   1059 class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
   1060     """An authentication protocol defined by RFC 2069
   1061 
   1062     Digest authentication improves on basic authentication because it
   1063     does not transmit passwords in the clear.
   1064     """
   1065 
   1066     auth_header = 'Authorization'
   1067     handler_order = 490  # before Basic auth
   1068 
   1069     def http_error_401(self, req, fp, code, msg, headers):
   1070         host = urlparse.urlparse(req.get_full_url())[1]
   1071         retry = self.http_error_auth_reqed('www-authenticate',
   1072                                            host, req, headers)
   1073         self.reset_retry_count()
   1074         return retry
   1075 
   1076 
   1077 class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
   1078 
   1079     auth_header = 'Proxy-Authorization'
   1080     handler_order = 490  # before Basic auth
   1081 
   1082     def http_error_407(self, req, fp, code, msg, headers):
   1083         host = req.get_host()
   1084         retry = self.http_error_auth_reqed('proxy-authenticate',
   1085                                            host, req, headers)
   1086         self.reset_retry_count()
   1087         return retry
   1088 
   1089 class AbstractHTTPHandler(BaseHandler):
   1090 
   1091     def __init__(self, debuglevel=0):
   1092         self._debuglevel = debuglevel
   1093 
   1094     def set_http_debuglevel(self, level):
   1095         self._debuglevel = level
   1096 
   1097     def do_request_(self, request):
   1098         host = request.get_host()
   1099         if not host:
   1100             raise URLError('no host given')
   1101 
   1102         if request.has_data():  # POST
   1103             data = request.get_data()
   1104             if not request.has_header('Content-type'):
   1105                 request.add_unredirected_header(
   1106                     'Content-type',
   1107                     'application/x-www-form-urlencoded')
   1108             if not request.has_header('Content-length'):
   1109                 request.add_unredirected_header(
   1110                     'Content-length', '%d' % len(data))
   1111 
   1112         sel_host = host
   1113         if request.has_proxy():
   1114             scheme, sel = splittype(request.get_selector())
   1115             sel_host, sel_path = splithost(sel)
   1116 
   1117         if not request.has_header('Host'):
   1118             request.add_unredirected_header('Host', sel_host)
   1119         for name, value in self.parent.addheaders:
   1120             name = name.capitalize()
   1121             if not request.has_header(name):
   1122                 request.add_unredirected_header(name, value)
   1123 
   1124         return request
   1125 
   1126     def do_open(self, http_class, req):
   1127         """Return an addinfourl object for the request, using http_class.
   1128 
   1129         http_class must implement the HTTPConnection API from httplib.
   1130         The addinfourl return value is a file-like object.  It also
   1131         has methods and attributes including:
   1132             - info(): return a mimetools.Message object for the headers
   1133             - geturl(): return the original request URL
   1134             - code: HTTP status code
   1135         """
   1136         host = req.get_host()
   1137         if not host:
   1138             raise URLError('no host given')
   1139 
   1140         h = http_class(host, timeout=req.timeout) # will parse host:port
   1141         h.set_debuglevel(self._debuglevel)
   1142 
   1143         headers = dict(req.unredirected_hdrs)
   1144         headers.update(dict((k, v) for k, v in req.headers.items()
   1145                             if k not in headers))
   1146 
   1147         # We want to make an HTTP/1.1 request, but the addinfourl
   1148         # class isn't prepared to deal with a persistent connection.
   1149         # It will try to read all remaining data from the socket,
   1150         # which will block while the server waits for the next request.
   1151         # So make sure the connection gets closed after the (only)
   1152         # request.
   1153         headers["Connection"] = "close"
   1154         headers = dict(
   1155             (name.title(), val) for name, val in headers.items())
   1156 
   1157         if req._tunnel_host:
   1158             tunnel_headers = {}
   1159             proxy_auth_hdr = "Proxy-Authorization"
   1160             if proxy_auth_hdr in headers:
   1161                 tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
   1162                 # Proxy-Authorization should not be sent to origin
   1163                 # server.
   1164                 del headers[proxy_auth_hdr]
   1165             h.set_tunnel(req._tunnel_host, headers=tunnel_headers)
   1166 
   1167         try:
   1168             h.request(req.get_method(), req.get_selector(), req.data, headers)
   1169             try:
   1170                 r = h.getresponse(buffering=True)
   1171             except TypeError: #buffering kw not supported
   1172                 r = h.getresponse()
   1173         except socket.error, err: # XXX what error?
   1174             raise URLError(err)
   1175 
   1176         # Pick apart the HTTPResponse object to get the addinfourl
   1177         # object initialized properly.
   1178 
   1179         # Wrap the HTTPResponse object in socket's file object adapter
   1180         # for Windows.  That adapter calls recv(), so delegate recv()
   1181         # to read().  This weird wrapping allows the returned object to
   1182         # have readline() and readlines() methods.
   1183 
   1184         # XXX It might be better to extract the read buffering code
   1185         # out of socket._fileobject() and into a base class.
   1186 
   1187         r.recv = r.read
   1188         fp = socket._fileobject(r, close=True)
   1189 
   1190         resp = addinfourl(fp, r.msg, req.get_full_url())
   1191         resp.code = r.status
   1192         resp.msg = r.reason
   1193         return resp
   1194 
   1195 
   1196 class HTTPHandler(AbstractHTTPHandler):
   1197 
   1198     def http_open(self, req):
   1199         return self.do_open(httplib.HTTPConnection, req)
   1200 
   1201     http_request = AbstractHTTPHandler.do_request_
   1202 
   1203 if hasattr(httplib, 'HTTPS'):
   1204     class HTTPSHandler(AbstractHTTPHandler):
   1205 
   1206         def https_open(self, req):
   1207             return self.do_open(httplib.HTTPSConnection, req)
   1208 
   1209         https_request = AbstractHTTPHandler.do_request_
   1210 
   1211 class HTTPCookieProcessor(BaseHandler):
   1212     def __init__(self, cookiejar=None):
   1213         import cookielib
   1214         if cookiejar is None:
   1215             cookiejar = cookielib.CookieJar()
   1216         self.cookiejar = cookiejar
   1217 
   1218     def http_request(self, request):
   1219         self.cookiejar.add_cookie_header(request)
   1220         return request
   1221 
   1222     def http_response(self, request, response):
   1223         self.cookiejar.extract_cookies(response, request)
   1224         return response
   1225 
   1226     https_request = http_request
   1227     https_response = http_response
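    # Typical usage (a sketch; the CookieJar shown is what the constructor
    # creates by default anyway):
    #
    #     import cookielib
    #     cj = cookielib.CookieJar()
    #     opener = build_opener(HTTPCookieProcessor(cj))
    #     opener.open('http://www.example.com/')   # cookies are kept in cj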
   1228 
   1229 class UnknownHandler(BaseHandler):
   1230     def unknown_open(self, req):
   1231         type = req.get_type()
   1232         raise URLError('unknown url type: %s' % type)
   1233 
   1234 def parse_keqv_list(l):
   1235     """Parse list of key=value strings where keys are not duplicated."""
   1236     parsed = {}
   1237     for elt in l:
   1238         k, v = elt.split('=', 1)
   1239         if v[0] == '"' and v[-1] == '"':
   1240             v = v[1:-1]
   1241         parsed[k] = v
   1242     return parsed
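# For instance (a sketch), parse_keqv_list(['realm="example"', 'nc=00000001'])
# returns {'realm': 'example', 'nc': '00000001'}: surrounding double quotes
# are stripped from values.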
   1243 
   1244 def parse_http_list(s):
   1245     """Parse lists as described by RFC 2068 Section 2.
   1246 
   1247     In particular, parse comma-separated lists where the elements of
   1248     the list may include quoted-strings.  A quoted-string could
   1249     contain a comma.  A non-quoted string could have quotes in the
   1250     middle.  Neither commas nor quotes count if they are escaped.
   1251     Only double-quotes count, not single-quotes.
   1252     """
   1253     res = []
   1254     part = ''
   1255 
   1256     escape = quote = False
   1257     for cur in s:
   1258         if escape:
   1259             part += cur
   1260             escape = False
   1261             continue
   1262         if quote:
   1263             if cur == '\\':
   1264                 escape = True
   1265                 continue
   1266             elif cur == '"':
   1267                 quote = False
   1268             part += cur
   1269             continue
   1270 
   1271         if cur == ',':
   1272             res.append(part)
   1273             part = ''
   1274             continue
   1275 
   1276         if cur == '"':
   1277             quote = True
   1278 
   1279         part += cur
   1280 
   1281     # append last part
   1282     if part:
   1283         res.append(part)
   1284 
   1285     return [part.strip() for part in res]
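# For instance (a sketch):
#
#     >>> parse_http_list('text/html, text/plain, "a, quoted value"')
#     ['text/html', 'text/plain', '"a, quoted value"']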
   1286 
   1287 def _safe_gethostbyname(host):
   1288     try:
   1289         return socket.gethostbyname(host)
   1290     except socket.gaierror:
   1291         return None
   1292 
   1293 class FileHandler(BaseHandler):
   1294     # Use local file or FTP depending on form of URL
   1295     def file_open(self, req):
   1296         url = req.get_selector()
   1297         if url[:2] == '//' and url[2:3] != '/' and (req.host and
   1298                 req.host != 'localhost'):
   1299             req.type = 'ftp'
   1300             return self.parent.open(req)
   1301         else:
   1302             return self.open_local_file(req)
   1303 
   1304     # names for the localhost
   1305     names = None
   1306     def get_names(self):
   1307         if FileHandler.names is None:
   1308             try:
   1309                 FileHandler.names = tuple(
   1310                     socket.gethostbyname_ex('localhost')[2] +
   1311                     socket.gethostbyname_ex(socket.gethostname())[2])
   1312             except socket.gaierror:
   1313                 FileHandler.names = (socket.gethostbyname('localhost'),)
   1314         return FileHandler.names
   1315 
   1316     # not entirely sure what the rules are here
   1317     def open_local_file(self, req):
   1318         import email.utils
   1319         import mimetypes
   1320         host = req.get_host()
   1321         filename = req.get_selector()
   1322         localfile = url2pathname(filename)
   1323         try:
   1324             stats = os.stat(localfile)
   1325             size = stats.st_size
   1326             modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
   1327             mtype = mimetypes.guess_type(filename)[0]
   1328             headers = mimetools.Message(StringIO(
   1329                 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
   1330                 (mtype or 'text/plain', size, modified)))
   1331             if host:
   1332                 host, port = splitport(host)
   1333             if not host or \
   1334                 (not port and _safe_gethostbyname(host) in self.get_names()):
   1335                 if host:
   1336                     origurl = 'file://' + host + filename
   1337                 else:
   1338                     origurl = 'file://' + filename
   1339                 return addinfourl(open(localfile, 'rb'), headers, origurl)
   1340         except OSError, msg:
   1341             # urllib2 users shouldn't expect OSErrors coming from urlopen()
   1342             raise URLError(msg)
   1343         raise URLError('file not on local host')
   1344 
   1345 class FTPHandler(BaseHandler):
   1346     def ftp_open(self, req):
   1347         import ftplib
   1348         import mimetypes
   1349         host = req.get_host()
   1350         if not host:
   1351             raise URLError('ftp error: no host given')
   1352         host, port = splitport(host)
   1353         if port is None:
   1354             port = ftplib.FTP_PORT
   1355         else:
   1356             port = int(port)
   1357 
   1358         # username/password handling
   1359         user, host = splituser(host)
   1360         if user:
   1361             user, passwd = splitpasswd(user)
   1362         else:
   1363             passwd = None
   1364         host = unquote(host)
   1365         user = user or ''
   1366         passwd = passwd or ''
   1367 
   1368         try:
   1369             host = socket.gethostbyname(host)
   1370         except socket.error, msg:
   1371             raise URLError(msg)
   1372         path, attrs = splitattr(req.get_selector())
   1373         dirs = path.split('/')
   1374         dirs = map(unquote, dirs)
   1375         dirs, file = dirs[:-1], dirs[-1]
   1376         if dirs and not dirs[0]:
   1377             dirs = dirs[1:]
   1378         try:
   1379             fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
   1380             type = file and 'I' or 'D'
   1381             for attr in attrs:
   1382                 attr, value = splitvalue(attr)
   1383                 if attr.lower() == 'type' and \
   1384                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
   1385                     type = value.upper()
   1386             fp, retrlen = fw.retrfile(file, type)
   1387             headers = ""
   1388             mtype = mimetypes.guess_type(req.get_full_url())[0]
   1389             if mtype:
   1390                 headers += "Content-type: %s\n" % mtype
   1391             if retrlen is not None and retrlen >= 0:
   1392                 headers += "Content-length: %d\n" % retrlen
   1393             sf = StringIO(headers)
   1394             headers = mimetools.Message(sf)
   1395             return addinfourl(fp, headers, req.get_full_url())
   1396         except ftplib.all_errors, msg:
   1397             raise URLError, ('ftp error: %s' % msg), sys.exc_info()[2]
   1398 
   1399     def connect_ftp(self, user, passwd, host, port, dirs, timeout):
   1400         fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
   1401 ##        fw.ftp.set_debuglevel(1)
   1402         return fw
   1403 
   1404 class CacheFTPHandler(FTPHandler):
   1405     # XXX would be nice to have pluggable cache strategies
   1406     # XXX this stuff is definitely not thread safe
   1407     def __init__(self):
   1408         self.cache = {}
   1409         self.timeout = {}
   1410         self.soonest = 0
   1411         self.delay = 60
   1412         self.max_conns = 16
   1413 
   1414     def setTimeout(self, t):
   1415         self.delay = t
   1416 
   1417     def setMaxConns(self, m):
   1418         self.max_conns = m
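    # Usage sketch: the module docstring installs CacheFTPHandler through
    # build_opener(); the two setters above tune the connection cache, e.g.
    #
    #     h = CacheFTPHandler()
    #     h.setTimeout(30)     # keep idle FTP connections around for 30 seconds
    #     h.setMaxConns(4)
    #     opener = build_opener(h)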
   1419 
   1420     def connect_ftp(self, user, passwd, host, port, dirs, timeout):
   1421         key = user, host, port, '/'.join(dirs), timeout
   1422         if key in self.cache:
   1423             self.timeout[key] = time.time() + self.delay
   1424         else:
   1425             self.cache[key] = ftpwrapper(user, passwd, host, port, dirs, timeout)
   1426             self.timeout[key] = time.time() + self.delay
   1427         self.check_cache()
   1428         return self.cache[key]
   1429 
   1430     def check_cache(self):
   1431         # first check for old ones
   1432         t = time.time()
   1433         if self.soonest <= t:
   1434             for k, v in self.timeout.items():
   1435                 if v < t:
   1436                     self.cache[k].close()
   1437                     del self.cache[k]
   1438                     del self.timeout[k]
   1439         self.soonest = min(self.timeout.values())
   1440 
   1441         # then check the size
   1442         if len(self.cache) == self.max_conns:
   1443             for k, v in self.timeout.items():
   1444                 if v == self.soonest:
   1445                     del self.cache[k]
   1446                     del self.timeout[k]
   1447                     break
   1448             self.soonest = min(self.timeout.values())
   1449