Home | History | Annotate | Download | only in python2.7
      1 """An extensible library for opening URLs using a variety of protocols
      2 
      3 The simplest way to use this module is to call the urlopen function,
      4 which accepts a string containing a URL or a Request object (described
below).  It opens the URL and returns the results as a file-like
      6 object; the returned object has some extra methods described below.
      7 
      8 The OpenerDirector manages a collection of Handler objects that do
      9 all the actual work.  Each Handler implements a particular protocol or
     10 option.  The OpenerDirector is a composite object that invokes the
     11 Handlers needed to open the requested URL.  For example, the
     12 HTTPHandler performs HTTP GET and POST requests and deals with
     13 non-error returns.  The HTTPRedirectHandler automatically deals with
     14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
     15 deals with digest authentication.
     16 
     17 urlopen(url, data=None) -- Basic usage is the same as original
urllib.  Pass the url and optionally data to post to an HTTP URL, and
     19 get a file-like object back.  One difference is that you can also pass
     20 a Request instance instead of URL.  Raises a URLError (subclass of
     21 IOError); for HTTP errors, raises an HTTPError, which can also be
     22 treated as a valid response.
     23 
     24 build_opener -- Function that creates a new OpenerDirector instance.
     25 Will install the default handlers.  Accepts one or more Handlers as
     26 arguments, either instances or Handler classes that it will
instantiate.  If one of the arguments is a subclass of the default
     28 handler, the argument will be installed instead of the default.
     29 
     30 install_opener -- Installs a new opener as the default opener.
     31 
     32 objects of interest:
     33 
     34 OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
     35 the Handler classes, while dealing with requests and responses.
     36 
     37 Request -- An object that encapsulates the state of a request.  The
     38 state can be as simple as the URL.  It can also include extra HTTP
     39 headers, e.g. a User-Agent.
     40 
     41 BaseHandler --
     42 
     43 exceptions:
     44 URLError -- A subclass of IOError, individual protocols have their own
     45 specific subclass.
     46 
     47 HTTPError -- Also a valid HTTP response, so you can treat an HTTP error
     48 as an exceptional event or valid response.
     49 
     50 internals:
     51 BaseHandler and parent
     52 _call_chain conventions
     53 
     54 Example usage:
     55 
     56 import urllib2
     57 
     58 # set up authentication info
     59 authinfo = urllib2.HTTPBasicAuthHandler()
     60 authinfo.add_password(realm='PDQ Application',
     61                       uri='https://mahler:8092/site-updates.py',
     62                       user='klem',
     63                       passwd='geheim$parole')
     64 
     65 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
     66 
     67 # build a new opener that adds authentication and caching FTP handlers
     68 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
     69 
     70 # install it
     71 urllib2.install_opener(opener)
     72 
     73 f = urllib2.urlopen('http://www.python.org/')
     74 
     75 
     76 """
     77 
     78 # XXX issues:
# If an authentication error handler tries to perform
     80 # authentication for some reason but fails, how should the error be
     81 # signalled?  The client needs to know the HTTP error code.  But if
     82 # the handler knows that the problem was, e.g., that it didn't know
# the hash algo that was requested in the challenge, it would be good to
     84 # pass that information along to the client, too.
     85 # ftp errors aren't handled cleanly
     86 # check digest against correct (i.e. non-apache) implementation
     87 
     88 # Possible extensions:
     89 # complex proxies  XXX not sure what exactly was meant by this
     90 # abstract factory for opener
     91 
     92 import base64
     93 import hashlib
     94 import httplib
     95 import mimetools
     96 import os
     97 import posixpath
     98 import random
     99 import re
    100 import socket
    101 import sys
    102 import time
    103 import urlparse
    104 import bisect
    105 import warnings
    106 
    107 try:
    108     from cStringIO import StringIO
    109 except ImportError:
    110     from StringIO import StringIO
    111 
    112 from urllib import (unwrap, unquote, splittype, splithost, quote,
    113      addinfourl, splitport, splittag, toBytes,
    114      splitattr, ftpwrapper, splituser, splitpasswd, splitvalue)
    115 
    116 # support for FileHandler, proxies via environment variables
    117 from urllib import localhost, url2pathname, getproxies, proxy_bypass
    118 
    119 # used in User-Agent header sent
    120 __version__ = sys.version[:3]
    121 
    122 _opener = None
    123 def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    124     global _opener
    125     if _opener is None:
    126         _opener = build_opener()
    127     return _opener.open(url, data, timeout)
    128 
    129 def install_opener(opener):
    130     global _opener
    131     _opener = opener
    132 
    133 # do these error classes make sense?
    134 # make sure all of the IOError stuff is overridden.  we just want to be
    135 # subtypes.
    136 
    137 class URLError(IOError):
    138     # URLError is a sub-type of IOError, but it doesn't share any of
    139     # the implementation.  need to override __init__ and __str__.
    140     # It sets self.args for compatibility with other EnvironmentError
    141     # subclasses, but args doesn't have the typical format with errno in
    142     # slot 0 and strerror in slot 1.  This may be better than nothing.
    143     def __init__(self, reason):
    144         self.args = reason,
    145         self.reason = reason
    146 
    147     def __str__(self):
    148         return '<urlopen error %s>' % self.reason
    149 
    150 class HTTPError(URLError, addinfourl):
    151     """Raised when HTTP error occurs, but also acts like non-error return"""
    152     __super_init = addinfourl.__init__
    153 
    154     def __init__(self, url, code, msg, hdrs, fp):
    155         self.code = code
    156         self.msg = msg
    157         self.hdrs = hdrs
    158         self.fp = fp
    159         self.filename = url
    160         # The addinfourl classes depend on fp being a valid file
    161         # object.  In some cases, the HTTPError may not have a valid
    162         # file object.  If this happens, the simplest workaround is to
    163         # not initialize the base classes.
    164         if fp is not None:
    165             self.__super_init(fp, hdrs, url, code)
    166 
    167     def __str__(self):
    168         return 'HTTP Error %s: %s' % (self.code, self.msg)
    169 
    170     # since URLError specifies a .reason attribute, HTTPError should also
    171     #  provide this attribute. See issue13211 fo discussion.
    172     @property
    173     def reason(self):
    174         return self.msg
    175 
    176     def info(self):
    177         return self.hdrs
    178 
    179 # copied from cookielib.py
    180 _cut_port_re = re.compile(r":\d+$")
    181 def request_host(request):
    182     """Return request-host, as defined by RFC 2965.
    183 
    184     Variation from RFC: returned value is lowercased, for convenient
    185     comparison.
    186 
    187     """
    188     url = request.get_full_url()
    189     host = urlparse.urlparse(url)[1]
    190     if host == "":
    191         host = request.get_header("Host", "")
    192 
    193     # remove port, if present
    194     host = _cut_port_re.sub("", host, 1)
    195     return host.lower()
    196 
    197 class Request:
    198 
    199     def __init__(self, url, data=None, headers={},
    200                  origin_req_host=None, unverifiable=False):
    201         # unwrap('<URL:type://host/path>') --> 'type://host/path'
    202         self.__original = unwrap(url)
    203         self.__original, self.__fragment = splittag(self.__original)
    204         self.type = None
    205         # self.__r_type is what's left after doing the splittype
    206         self.host = None
    207         self.port = None
    208         self._tunnel_host = None
    209         self.data = data
    210         self.headers = {}
    211         for key, value in headers.items():
    212             self.add_header(key, value)
    213         self.unredirected_hdrs = {}
    214         if origin_req_host is None:
    215             origin_req_host = request_host(self)
    216         self.origin_req_host = origin_req_host
    217         self.unverifiable = unverifiable
    218 
    219     def __getattr__(self, attr):
    220         # XXX this is a fallback mechanism to guard against these
    221         # methods getting called in a non-standard order.  this may be
    222         # too complicated and/or unnecessary.
    223         # XXX should the __r_XXX attributes be public?
    224         if attr[:12] == '_Request__r_':
    225             name = attr[12:]
    226             if hasattr(Request, 'get_' + name):
    227                 getattr(self, 'get_' + name)()
    228                 return getattr(self, attr)
    229         raise AttributeError, attr
    230 
    231     def get_method(self):
    232         if self.has_data():
    233             return "POST"
    234         else:
    235             return "GET"
    236 
    237     # XXX these helper methods are lame
    238 
    239     def add_data(self, data):
    240         self.data = data
    241 
    242     def has_data(self):
    243         return self.data is not None
    244 
    245     def get_data(self):
    246         return self.data
    247 
    248     def get_full_url(self):
    249         if self.__fragment:
    250             return '%s#%s' % (self.__original, self.__fragment)
    251         else:
    252             return self.__original
    253 
    254     def get_type(self):
    255         if self.type is None:
    256             self.type, self.__r_type = splittype(self.__original)
    257             if self.type is None:
    258                 raise ValueError, "unknown url type: %s" % self.__original
    259         return self.type
    260 
    261     def get_host(self):
    262         if self.host is None:
    263             self.host, self.__r_host = splithost(self.__r_type)
    264             if self.host:
    265                 self.host = unquote(self.host)
    266         return self.host
    267 
    268     def get_selector(self):
    269         return self.__r_host
    270 
    271     def set_proxy(self, host, type):
    272         if self.type == 'https' and not self._tunnel_host:
    273             self._tunnel_host = self.host
    274         else:
    275             self.type = type
    276             self.__r_host = self.__original
    277 
    278         self.host = host
    279 
    280     def has_proxy(self):
    281         return self.__r_host == self.__original
    282 
    283     def get_origin_req_host(self):
    284         return self.origin_req_host
    285 
    286     def is_unverifiable(self):
    287         return self.unverifiable
    288 
    289     def add_header(self, key, val):
    290         # useful for something like authentication
    291         self.headers[key.capitalize()] = val
    292 
    293     def add_unredirected_header(self, key, val):
    294         # will not be added to a redirected request
    295         self.unredirected_hdrs[key.capitalize()] = val
    296 
    297     def has_header(self, header_name):
    298         return (header_name in self.headers or
    299                 header_name in self.unredirected_hdrs)
    300 
    301     def get_header(self, header_name, default=None):
    302         return self.headers.get(
    303             header_name,
    304             self.unredirected_hdrs.get(header_name, default))
    305 
    306     def header_items(self):
    307         hdrs = self.unredirected_hdrs.copy()
    308         hdrs.update(self.headers)
    309         return hdrs.items()
    310 
    311 class OpenerDirector:
    312     def __init__(self):
    313         client_version = "Python-urllib/%s" % __version__
    314         self.addheaders = [('User-agent', client_version)]
    315         # self.handlers is retained only for backward compatibility
    316         self.handlers = []
    317         # manage the individual handlers
    318         self.handle_open = {}
    319         self.handle_error = {}
    320         self.process_response = {}
    321         self.process_request = {}
    322 
    323     def add_handler(self, handler):
    324         if not hasattr(handler, "add_parent"):
    325             raise TypeError("expected BaseHandler instance, got %r" %
    326                             type(handler))
    327 
    328         added = False
    329         for meth in dir(handler):
    330             if meth in ["redirect_request", "do_open", "proxy_open"]:
    331                 # oops, coincidental match
    332                 continue
    333 
    334             i = meth.find("_")
    335             protocol = meth[:i]
    336             condition = meth[i+1:]
    337 
    338             if condition.startswith("error"):
    339                 j = condition.find("_") + i + 1
    340                 kind = meth[j+1:]
    341                 try:
    342                     kind = int(kind)
    343                 except ValueError:
    344                     pass
    345                 lookup = self.handle_error.get(protocol, {})
    346                 self.handle_error[protocol] = lookup
    347             elif condition == "open":
    348                 kind = protocol
    349                 lookup = self.handle_open
    350             elif condition == "response":
    351                 kind = protocol
    352                 lookup = self.process_response
    353             elif condition == "request":
    354                 kind = protocol
    355                 lookup = self.process_request
    356             else:
    357                 continue
    358 
    359             handlers = lookup.setdefault(kind, [])
    360             if handlers:
    361                 bisect.insort(handlers, handler)
    362             else:
    363                 handlers.append(handler)
    364             added = True
    365 
    366         if added:
    367             bisect.insort(self.handlers, handler)
    368             handler.add_parent(self)
    369 
    370     def close(self):
    371         # Only exists for backwards compatibility.
    372         pass
    373 
    374     def _call_chain(self, chain, kind, meth_name, *args):
    375         # Handlers raise an exception if no one else should try to handle
    376         # the request, or return None if they can't but another handler
    377         # could.  Otherwise, they return the response.
    378         handlers = chain.get(kind, ())
    379         for handler in handlers:
    380             func = getattr(handler, meth_name)
    381 
    382             result = func(*args)
    383             if result is not None:
    384                 return result
    385 
    386     def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    387         # accept a URL or a Request object
    388         if isinstance(fullurl, basestring):
    389             req = Request(fullurl, data)
    390         else:
    391             req = fullurl
    392             if data is not None:
    393                 req.add_data(data)
    394 
    395         req.timeout = timeout
    396         protocol = req.get_type()
    397 
    398         # pre-process request
    399         meth_name = protocol+"_request"
    400         for processor in self.process_request.get(protocol, []):
    401             meth = getattr(processor, meth_name)
    402             req = meth(req)
    403 
    404         response = self._open(req, data)
    405 
    406         # post-process response
    407         meth_name = protocol+"_response"
    408         for processor in self.process_response.get(protocol, []):
    409             meth = getattr(processor, meth_name)
    410             response = meth(req, response)
    411 
    412         return response
    413 
    414     def _open(self, req, data=None):
    415         result = self._call_chain(self.handle_open, 'default',
    416                                   'default_open', req)
    417         if result:
    418             return result
    419 
    420         protocol = req.get_type()
    421         result = self._call_chain(self.handle_open, protocol, protocol +
    422                                   '_open', req)
    423         if result:
    424             return result
    425 
    426         return self._call_chain(self.handle_open, 'unknown',
    427                                 'unknown_open', req)
    428 
    429     def error(self, proto, *args):
    430         if proto in ('http', 'https'):
    431             # XXX http[s] protocols are special-cased
    432             dict = self.handle_error['http'] # https is not different than http
    433             proto = args[2]  # YUCK!
    434             meth_name = 'http_error_%s' % proto
    435             http_err = 1
    436             orig_args = args
    437         else:
    438             dict = self.handle_error
    439             meth_name = proto + '_error'
    440             http_err = 0
    441         args = (dict, proto, meth_name) + args
    442         result = self._call_chain(*args)
    443         if result:
    444             return result
    445 
    446         if http_err:
    447             args = (dict, 'default', 'http_error_default') + orig_args
    448             return self._call_chain(*args)
    449 
    450 # XXX probably also want an abstract factory that knows when it makes
    451 # sense to skip a superclass in favor of a subclass and when it might
    452 # make sense to include both
    453 
    454 def build_opener(*handlers):
    455     """Create an opener object from a list of handlers.
    456 
    457     The opener will use several default handlers, including support
    458     for HTTP, FTP and when applicable, HTTPS.
    459 
    460     If any of the handlers passed as arguments are subclasses of the
    461     default handlers, the default handlers will not be used.
    462     """
    463     import types
    464     def isclass(obj):
    465         return isinstance(obj, (types.ClassType, type))
    466 
    467     opener = OpenerDirector()
    468     default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
    469                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
    470                        FTPHandler, FileHandler, HTTPErrorProcessor]
    471     if hasattr(httplib, 'HTTPS'):
    472         default_classes.append(HTTPSHandler)
    473     skip = set()
    474     for klass in default_classes:
    475         for check in handlers:
    476             if isclass(check):
    477                 if issubclass(check, klass):
    478                     skip.add(klass)
    479             elif isinstance(check, klass):
    480                 skip.add(klass)
    481     for klass in skip:
    482         default_classes.remove(klass)
    483 
    484     for klass in default_classes:
    485         opener.add_handler(klass())
    486 
    487     for h in handlers:
    488         if isclass(h):
    489             h = h()
    490         opener.add_handler(h)
    491     return opener
    492 
    493 class BaseHandler:
    494     handler_order = 500
    495 
    496     def add_parent(self, parent):
    497         self.parent = parent
    498 
    499     def close(self):
    500         # Only exists for backwards compatibility
    501         pass
    502 
    503     def __lt__(self, other):
    504         if not hasattr(other, "handler_order"):
    505             # Try to preserve the old behavior of having custom classes
    506             # inserted after default ones (works only for custom user
    507             # classes which are not aware of handler_order).
    508             return True
    509         return self.handler_order < other.handler_order
    510 
    511 
    512 class HTTPErrorProcessor(BaseHandler):
    513     """Process HTTP error responses."""
    514     handler_order = 1000  # after all other processing
    515 
    516     def http_response(self, request, response):
    517         code, msg, hdrs = response.code, response.msg, response.info()
    518 
    519         # According to RFC 2616, "2xx" code indicates that the client's
    520         # request was successfully received, understood, and accepted.
    521         if not (200 <= code < 300):
    522             response = self.parent.error(
    523                 'http', request, response, code, msg, hdrs)
    524 
    525         return response
    526 
    527     https_response = http_response
    528 
    529 class HTTPDefaultErrorHandler(BaseHandler):
    530     def http_error_default(self, req, fp, code, msg, hdrs):
    531         raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
    532 
    533 class HTTPRedirectHandler(BaseHandler):
    534     # maximum number of redirections to any single URL
    535     # this is needed because of the state that cookies introduce
    536     max_repeats = 4
    537     # maximum total number of redirections (regardless of URL) before
    538     # assuming we're in a loop
    539     max_redirections = 10
    540 
    541     def redirect_request(self, req, fp, code, msg, headers, newurl):
    542         """Return a Request or None in response to a redirect.
    543 
    544         This is called by the http_error_30x methods when a
    545         redirection response is received.  If a redirection should
    546         take place, return a new Request to allow http_error_30x to
    547         perform the redirect.  Otherwise, raise HTTPError if no-one
    548         else should try to handle this url.  Return None if you can't
    549         but another Handler might.
    550         """
    551         m = req.get_method()
    552         if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
    553             or code in (301, 302, 303) and m == "POST"):
    554             # Strictly (according to RFC 2616), 301 or 302 in response
    555             # to a POST MUST NOT cause a redirection without confirmation
    556             # from the user (of urllib2, in this case).  In practice,
    557             # essentially all clients do redirect in this case, so we
    558             # do the same.
    559             # be conciliant with URIs containing a space
    560             newurl = newurl.replace(' ', '%20')
    561             newheaders = dict((k,v) for k,v in req.headers.items()
    562                               if k.lower() not in ("content-length", "content-type")
    563                              )
    564             return Request(newurl,
    565                            headers=newheaders,
    566                            origin_req_host=req.get_origin_req_host(),
    567                            unverifiable=True)
    568         else:
    569             raise HTTPError(req.get_full_url(), code, msg, headers, fp)
    570 
    571     # Implementation note: To avoid the server sending us into an
    572     # infinite loop, the request object needs to track what URLs we
    573     # have already seen.  Do this by adding a handler-specific
    574     # attribute to the Request object.
    575     def http_error_302(self, req, fp, code, msg, headers):
    576         # Some servers (incorrectly) return multiple Location headers
    577         # (so probably same goes for URI).  Use first header.
    578         if 'location' in headers:
    579             newurl = headers.getheaders('location')[0]
    580         elif 'uri' in headers:
    581             newurl = headers.getheaders('uri')[0]
    582         else:
    583             return
    584 
    585         # fix a possible malformed URL
    586         urlparts = urlparse.urlparse(newurl)
    587         if not urlparts.path:
    588             urlparts = list(urlparts)
    589             urlparts[2] = "/"
    590         newurl = urlparse.urlunparse(urlparts)
    591 
    592         newurl = urlparse.urljoin(req.get_full_url(), newurl)
    593 
    594         # For security reasons we do not allow redirects to protocols
    595         # other than HTTP, HTTPS or FTP.
    596         newurl_lower = newurl.lower()
    597         if not (newurl_lower.startswith('http://') or
    598                 newurl_lower.startswith('https://') or
    599                 newurl_lower.startswith('ftp://')):
    600             raise HTTPError(newurl, code,
    601                             msg + " - Redirection to url '%s' is not allowed" %
    602                             newurl,
    603                             headers, fp)
    604 
    605         # XXX Probably want to forget about the state of the current
    606         # request, although that might interact poorly with other
    607         # handlers that also use handler-specific request attributes
    608         new = self.redirect_request(req, fp, code, msg, headers, newurl)
    609         if new is None:
    610             return
    611 
    612         # loop detection
    613         # .redirect_dict has a key url if url was previously visited.
    614         if hasattr(req, 'redirect_dict'):
    615             visited = new.redirect_dict = req.redirect_dict
    616             if (visited.get(newurl, 0) >= self.max_repeats or
    617                 len(visited) >= self.max_redirections):
    618                 raise HTTPError(req.get_full_url(), code,
    619                                 self.inf_msg + msg, headers, fp)
    620         else:
    621             visited = new.redirect_dict = req.redirect_dict = {}
    622         visited[newurl] = visited.get(newurl, 0) + 1
    623 
    624         # Don't close the fp until we are sure that we won't use it
    625         # with HTTPError.
    626         fp.read()
    627         fp.close()
    628 
    629         return self.parent.open(new, timeout=req.timeout)
    630 
    631     http_error_301 = http_error_303 = http_error_307 = http_error_302
    632 
    633     inf_msg = "The HTTP server returned a redirect error that would " \
    634               "lead to an infinite loop.\n" \
    635               "The last 30x error message was:\n"
    636 
    637 
    638 def _parse_proxy(proxy):
    639     """Return (scheme, user, password, host/port) given a URL or an authority.
    640 
    641     If a URL is supplied, it must have an authority (host:port) component.
    642     According to RFC 3986, having an authority component means the URL must
    643     have two slashes after the scheme:
    644 
    645     >>> _parse_proxy('file:/ftp.example.com/')
    646     Traceback (most recent call last):
    647     ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
    648 
    649     The first three items of the returned tuple may be None.
    650 
    651     Examples of authority parsing:
    652 
    653     >>> _parse_proxy('proxy.example.com')
    654     (None, None, None, 'proxy.example.com')
    655     >>> _parse_proxy('proxy.example.com:3128')
    656     (None, None, None, 'proxy.example.com:3128')
    657 
    658     The authority component may optionally include userinfo (assumed to be
    659     username:password):
    660 
    661     >>> _parse_proxy('joe:password (at] proxy.example.com')
    662     (None, 'joe', 'password', 'proxy.example.com')
    663     >>> _parse_proxy('joe:password (at] proxy.example.com:3128')
    664     (None, 'joe', 'password', 'proxy.example.com:3128')
    665 
    666     Same examples, but with URLs instead:
    667 
    668     >>> _parse_proxy('http://proxy.example.com/')
    669     ('http', None, None, 'proxy.example.com')
    670     >>> _parse_proxy('http://proxy.example.com:3128/')
    671     ('http', None, None, 'proxy.example.com:3128')
    672     >>> _parse_proxy('http://joe:password@proxy.example.com/')
    673     ('http', 'joe', 'password', 'proxy.example.com')
    674     >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    675     ('http', 'joe', 'password', 'proxy.example.com:3128')
    676 
    677     Everything after the authority is ignored:
    678 
    679     >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    680     ('ftp', 'joe', 'password', 'proxy.example.com')
    681 
    682     Test for no trailing '/' case:
    683 
    684     >>> _parse_proxy('http://joe:password@proxy.example.com')
    685     ('http', 'joe', 'password', 'proxy.example.com')
    686 
    687     """
    688     scheme, r_scheme = splittype(proxy)
    689     if not r_scheme.startswith("/"):
    690         # authority
    691         scheme = None
    692         authority = proxy
    693     else:
    694         # URL
    695         if not r_scheme.startswith("//"):
    696             raise ValueError("proxy URL with no authority: %r" % proxy)
    697         # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
    698         # and 3.3.), path is empty or starts with '/'
    699         end = r_scheme.find("/", 2)
    700         if end == -1:
    701             end = None
    702         authority = r_scheme[2:end]
    703     userinfo, hostport = splituser(authority)
    704     if userinfo is not None:
    705         user, password = splitpasswd(userinfo)
    706     else:
    707         user = password = None
    708     return scheme, user, password, hostport
    709 
    710 class ProxyHandler(BaseHandler):
    711     # Proxies must be in front
    712     handler_order = 100
    713 
    714     def __init__(self, proxies=None):
    715         if proxies is None:
    716             proxies = getproxies()
    717         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
    718         self.proxies = proxies
    719         for type, url in proxies.items():
    720             setattr(self, '%s_open' % type,
    721                     lambda r, proxy=url, type=type, meth=self.proxy_open: \
    722                     meth(r, proxy, type))
    723 
    724     def proxy_open(self, req, proxy, type):
    725         orig_type = req.get_type()
    726         proxy_type, user, password, hostport = _parse_proxy(proxy)
    727 
    728         if proxy_type is None:
    729             proxy_type = orig_type
    730 
    731         if req.host and proxy_bypass(req.host):
    732             return None
    733 
    734         if user and password:
    735             user_pass = '%s:%s' % (unquote(user), unquote(password))
    736             creds = base64.b64encode(user_pass).strip()
    737             req.add_header('Proxy-authorization', 'Basic ' + creds)
    738         hostport = unquote(hostport)
    739         req.set_proxy(hostport, proxy_type)
    740 
    741         if orig_type == proxy_type or orig_type == 'https':
    742             # let other handlers take care of it
    743             return None
    744         else:
    745             # need to start over, because the other handlers don't
    746             # grok the proxy's URL type
    747             # e.g. if we have a constructor arg proxies like so:
    748             # {'http': 'ftp://proxy.example.com'}, we may end up turning
    749             # a request for http://acme.example.com/a into one for
    750             # ftp://proxy.example.com/a
    751             return self.parent.open(req, timeout=req.timeout)
    752 
    753 class HTTPPasswordMgr:
    754 
    755     def __init__(self):
    756         self.passwd = {}
    757 
    758     def add_password(self, realm, uri, user, passwd):
    759         # uri could be a single URI or a sequence
    760         if isinstance(uri, basestring):
    761             uri = [uri]
    762         if not realm in self.passwd:
    763             self.passwd[realm] = {}
    764         for default_port in True, False:
    765             reduced_uri = tuple(
    766                 [self.reduce_uri(u, default_port) for u in uri])
    767             self.passwd[realm][reduced_uri] = (user, passwd)
    768 
    769     def find_user_password(self, realm, authuri):
    770         domains = self.passwd.get(realm, {})
    771         for default_port in True, False:
    772             reduced_authuri = self.reduce_uri(authuri, default_port)
    773             for uris, authinfo in domains.iteritems():
    774                 for uri in uris:
    775                     if self.is_suburi(uri, reduced_authuri):
    776                         return authinfo
    777         return None, None
    778 
    779     def reduce_uri(self, uri, default_port=True):
    780         """Accept authority or URI and extract only the authority and path."""
    781         # note HTTP URLs do not have a userinfo component
    782         parts = urlparse.urlsplit(uri)
    783         if parts[1]:
    784             # URI
    785             scheme = parts[0]
    786             authority = parts[1]
    787             path = parts[2] or '/'
    788         else:
    789             # host or host:port
    790             scheme = None
    791             authority = uri
    792             path = '/'
    793         host, port = splitport(authority)
    794         if default_port and port is None and scheme is not None:
    795             dport = {"http": 80,
    796                      "https": 443,
    797                      }.get(scheme)
    798             if dport is not None:
    799                 authority = "%s:%d" % (host, dport)
    800         return authority, path
    801 
    802     def is_suburi(self, base, test):
    803         """Check if test is below base in a URI tree
    804 
    805         Both args must be URIs in reduced form.
    806         """
    807         if base == test:
    808             return True
    809         if base[0] != test[0]:
    810             return False
    811         common = posixpath.commonprefix((base[1], test[1]))
    812         if len(common) == len(base[1]):
    813             return True
    814         return False
    815 
    816 
    817 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    818 
    819     def find_user_password(self, realm, authuri):
    820         user, password = HTTPPasswordMgr.find_user_password(self, realm,
    821                                                             authuri)
    822         if user is not None:
    823             return user, password
    824         return HTTPPasswordMgr.find_user_password(self, None, authuri)
    825 
    826 
    827 class AbstractBasicAuthHandler:
    828 
    829     # XXX this allows for multiple auth-schemes, but will stupidly pick
    830     # the last one with a realm specified.
    831 
    832     # allow for double- and single-quoted realm values
    833     # (single quotes are a violation of the RFC, but appear in the wild)
    834     rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
    835                     'realm=(["\']?)([^"\']*)\\2', re.I)
    836 
    837     # XXX could pre-emptively send auth info already accepted (RFC 2617,
    838     # end of section 2, and section 1.2 immediately after "credentials"
    839     # production).
    840 
    841     def __init__(self, password_mgr=None):
    842         if password_mgr is None:
    843             password_mgr = HTTPPasswordMgr()
    844         self.passwd = password_mgr
    845         self.add_password = self.passwd.add_password
    846         self.retried = 0
    847 
    848     def reset_retry_count(self):
    849         self.retried = 0
    850 
    851     def http_error_auth_reqed(self, authreq, host, req, headers):
    852         # host may be an authority (without userinfo) or a URL with an
    853         # authority
    854         # XXX could be multiple headers
    855         authreq = headers.get(authreq, None)
    856 
    857         if self.retried > 5:
    858             # retry sending the username:password 5 times before failing.
    859             raise HTTPError(req.get_full_url(), 401, "basic auth failed",
    860                             headers, None)
    861         else:
    862             self.retried += 1
    863 
    864         if authreq:
    865             mo = AbstractBasicAuthHandler.rx.search(authreq)
    866             if mo:
    867                 scheme, quote, realm = mo.groups()
    868                 if quote not in ['"', "'"]:
    869                     warnings.warn("Basic Auth Realm was unquoted",
    870                                   UserWarning, 2)
    871                 if scheme.lower() == 'basic':
    872                     response = self.retry_http_basic_auth(host, req, realm)
    873                     if response and response.code != 401:
    874                         self.retried = 0
    875                     return response
    876 
    877     def retry_http_basic_auth(self, host, req, realm):
    878         user, pw = self.passwd.find_user_password(realm, host)
    879         if pw is not None:
    880             raw = "%s:%s" % (user, pw)
    881             auth = 'Basic %s' % base64.b64encode(raw).strip()
    882             if req.headers.get(self.auth_header, None) == auth:
    883                 return None
    884             req.add_unredirected_header(self.auth_header, auth)
    885             return self.parent.open(req, timeout=req.timeout)
    886         else:
    887             return None
    888 
    889 
    890 class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    891 
    892     auth_header = 'Authorization'
    893 
    894     def http_error_401(self, req, fp, code, msg, headers):
    895         url = req.get_full_url()
    896         response = self.http_error_auth_reqed('www-authenticate',
    897                                               url, req, headers)
    898         self.reset_retry_count()
    899         return response
    900 
    901 
    902 class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    903 
    904     auth_header = 'Proxy-authorization'
    905 
    906     def http_error_407(self, req, fp, code, msg, headers):
    907         # http_error_auth_reqed requires that there is no userinfo component in
    908         # authority.  Assume there isn't one, since urllib2 does not (and
    909         # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
    910         # userinfo.
    911         authority = req.get_host()
    912         response = self.http_error_auth_reqed('proxy-authenticate',
    913                                           authority, req, headers)
    914         self.reset_retry_count()
    915         return response
    916 
    917 
    918 def randombytes(n):
    919     """Return n random bytes."""
    920     # Use /dev/urandom if it is available.  Fall back to random module
    921     # if not.  It might be worthwhile to extend this function to use
    922     # other platform-specific mechanisms for getting random bytes.
    923     if os.path.exists("/dev/urandom"):
    924         f = open("/dev/urandom")
    925         s = f.read(n)
    926         f.close()
    927         return s
    928     else:
    929         L = [chr(random.randrange(0, 256)) for i in range(n)]
    930         return "".join(L)
    931 
    932 class AbstractDigestAuthHandler:
    933     # Digest authentication is specified in RFC 2617.
    934 
    935     # XXX The client does not inspect the Authentication-Info header
    936     # in a successful response.
    937 
    938     # XXX It should be possible to test this implementation against
    939     # a mock server that just generates a static set of challenges.
    940 
    941     # XXX qop="auth-int" supports is shaky
    942 
    943     def __init__(self, passwd=None):
    944         if passwd is None:
    945             passwd = HTTPPasswordMgr()
    946         self.passwd = passwd
    947         self.add_password = self.passwd.add_password
    948         self.retried = 0
    949         self.nonce_count = 0
    950         self.last_nonce = None
    951 
    952     def reset_retry_count(self):
    953         self.retried = 0
    954 
    955     def http_error_auth_reqed(self, auth_header, host, req, headers):
    956         authreq = headers.get(auth_header, None)
    957         if self.retried > 5:
    958             # Don't fail endlessly - if we failed once, we'll probably
    959             # fail a second time. Hm. Unless the Password Manager is
    960             # prompting for the information. Crap. This isn't great
    961             # but it's better than the current 'repeat until recursion
    962             # depth exceeded' approach <wink>
    963             raise HTTPError(req.get_full_url(), 401, "digest auth failed",
    964                             headers, None)
    965         else:
    966             self.retried += 1
    967         if authreq:
    968             scheme = authreq.split()[0]
    969             if scheme.lower() == 'digest':
    970                 return self.retry_http_digest_auth(req, authreq)
    971 
    972     def retry_http_digest_auth(self, req, auth):
    973         token, challenge = auth.split(' ', 1)
    974         chal = parse_keqv_list(parse_http_list(challenge))
    975         auth = self.get_authorization(req, chal)
    976         if auth:
    977             auth_val = 'Digest %s' % auth
    978             if req.headers.get(self.auth_header, None) == auth_val:
    979                 return None
    980             req.add_unredirected_header(self.auth_header, auth_val)
    981             resp = self.parent.open(req, timeout=req.timeout)
    982             return resp
    983 
    984     def get_cnonce(self, nonce):
    985         # The cnonce-value is an opaque
    986         # quoted string value provided by the client and used by both client
    987         # and server to avoid chosen plaintext attacks, to provide mutual
    988         # authentication, and to provide some message integrity protection.
    989         # This isn't a fabulous effort, but it's probably Good Enough.
    990         dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
    991                                             randombytes(8))).hexdigest()
    992         return dig[:16]
    993 
    994     def get_authorization(self, req, chal):
    995         try:
    996             realm = chal['realm']
    997             nonce = chal['nonce']
    998             qop = chal.get('qop')
    999             algorithm = chal.get('algorithm', 'MD5')
   1000             # mod_digest doesn't send an opaque, even though it isn't
   1001             # supposed to be optional
   1002             opaque = chal.get('opaque', None)
   1003         except KeyError:
   1004             return None
   1005 
   1006         H, KD = self.get_algorithm_impls(algorithm)
   1007         if H is None:
   1008             return None
   1009 
   1010         user, pw = self.passwd.find_user_password(realm, req.get_full_url())
   1011         if user is None:
   1012             return None
   1013 
   1014         # XXX not implemented yet
   1015         if req.has_data():
   1016             entdig = self.get_entity_digest(req.get_data(), chal)
   1017         else:
   1018             entdig = None
   1019 
   1020         A1 = "%s:%s:%s" % (user, realm, pw)
   1021         A2 = "%s:%s" % (req.get_method(),
   1022                         # XXX selector: what about proxies and full urls
   1023                         req.get_selector())
   1024         if qop == 'auth':
   1025             if nonce == self.last_nonce:
   1026                 self.nonce_count += 1
   1027             else:
   1028                 self.nonce_count = 1
   1029                 self.last_nonce = nonce
   1030 
   1031             ncvalue = '%08x' % self.nonce_count
   1032             cnonce = self.get_cnonce(nonce)
   1033             noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
   1034             respdig = KD(H(A1), noncebit)
   1035         elif qop is None:
   1036             respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
   1037         else:
   1038             # XXX handle auth-int.
   1039             raise URLError("qop '%s' is not supported." % qop)
   1040 
   1041         # XXX should the partial digests be encoded too?
   1042 
   1043         base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
   1044                'response="%s"' % (user, realm, nonce, req.get_selector(),
   1045                                   respdig)
   1046         if opaque:
   1047             base += ', opaque="%s"' % opaque
   1048         if entdig:
   1049             base += ', digest="%s"' % entdig
   1050         base += ', algorithm="%s"' % algorithm
   1051         if qop:
   1052             base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
   1053         return base
   1054 
   1055     def get_algorithm_impls(self, algorithm):
   1056         # algorithm should be case-insensitive according to RFC2617
   1057         algorithm = algorithm.upper()
   1058         # lambdas assume digest modules are imported at the top level
   1059         if algorithm == 'MD5':
   1060             H = lambda x: hashlib.md5(x).hexdigest()
   1061         elif algorithm == 'SHA':
   1062             H = lambda x: hashlib.sha1(x).hexdigest()
   1063         # XXX MD5-sess
   1064         KD = lambda s, d: H("%s:%s" % (s, d))
   1065         return H, KD
   1066 
   1067     def get_entity_digest(self, data, chal):
   1068         # XXX not implemented yet
   1069         return None
   1070 
   1071 
   1072 class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
   1073     """An authentication protocol defined by RFC 2069
   1074 
   1075     Digest authentication improves on basic authentication because it
   1076     does not transmit passwords in the clear.
   1077     """
   1078 
   1079     auth_header = 'Authorization'
   1080     handler_order = 490  # before Basic auth
   1081 
   1082     def http_error_401(self, req, fp, code, msg, headers):
   1083         host = urlparse.urlparse(req.get_full_url())[1]
   1084         retry = self.http_error_auth_reqed('www-authenticate',
   1085                                            host, req, headers)
   1086         self.reset_retry_count()
   1087         return retry
   1088 
   1089 
   1090 class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
   1091 
   1092     auth_header = 'Proxy-Authorization'
   1093     handler_order = 490  # before Basic auth
   1094 
   1095     def http_error_407(self, req, fp, code, msg, headers):
   1096         host = req.get_host()
   1097         retry = self.http_error_auth_reqed('proxy-authenticate',
   1098                                            host, req, headers)
   1099         self.reset_retry_count()
   1100         return retry
   1101 
   1102 class AbstractHTTPHandler(BaseHandler):
   1103 
   1104     def __init__(self, debuglevel=0):
   1105         self._debuglevel = debuglevel
   1106 
   1107     def set_http_debuglevel(self, level):
   1108         self._debuglevel = level
   1109 
   1110     def do_request_(self, request):
   1111         host = request.get_host()
   1112         if not host:
   1113             raise URLError('no host given')
   1114 
   1115         if request.has_data():  # POST
   1116             data = request.get_data()
   1117             if not request.has_header('Content-type'):
   1118                 request.add_unredirected_header(
   1119                     'Content-type',
   1120                     'application/x-www-form-urlencoded')
   1121             if not request.has_header('Content-length'):
   1122                 request.add_unredirected_header(
   1123                     'Content-length', '%d' % len(data))
   1124 
   1125         sel_host = host
   1126         if request.has_proxy():
   1127             scheme, sel = splittype(request.get_selector())
   1128             sel_host, sel_path = splithost(sel)
   1129 
   1130         if not request.has_header('Host'):
   1131             request.add_unredirected_header('Host', sel_host)
   1132         for name, value in self.parent.addheaders:
   1133             name = name.capitalize()
   1134             if not request.has_header(name):
   1135                 request.add_unredirected_header(name, value)
   1136 
   1137         return request
   1138 
   1139     def do_open(self, http_class, req):
   1140         """Return an addinfourl object for the request, using http_class.
   1141 
   1142         http_class must implement the HTTPConnection API from httplib.
   1143         The addinfourl return value is a file-like object.  It also
   1144         has methods and attributes including:
   1145             - info(): return a mimetools.Message object for the headers
   1146             - geturl(): return the original request URL
   1147             - code: HTTP status code
   1148         """
   1149         host = req.get_host()
   1150         if not host:
   1151             raise URLError('no host given')
   1152 
   1153         h = http_class(host, timeout=req.timeout) # will parse host:port
   1154         h.set_debuglevel(self._debuglevel)
   1155 
   1156         headers = dict(req.unredirected_hdrs)
   1157         headers.update(dict((k, v) for k, v in req.headers.items()
   1158                             if k not in headers))
   1159 
   1160         # We want to make an HTTP/1.1 request, but the addinfourl
   1161         # class isn't prepared to deal with a persistent connection.
   1162         # It will try to read all remaining data from the socket,
   1163         # which will block while the server waits for the next request.
   1164         # So make sure the connection gets closed after the (only)
   1165         # request.
   1166         headers["Connection"] = "close"
   1167         headers = dict(
   1168             (name.title(), val) for name, val in headers.items())
   1169 
   1170         if req._tunnel_host:
   1171             tunnel_headers = {}
   1172             proxy_auth_hdr = "Proxy-Authorization"
   1173             if proxy_auth_hdr in headers:
   1174                 tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
   1175                 # Proxy-Authorization should not be sent to origin
   1176                 # server.
   1177                 del headers[proxy_auth_hdr]
   1178             h.set_tunnel(req._tunnel_host, headers=tunnel_headers)
   1179 
   1180         try:
   1181             h.request(req.get_method(), req.get_selector(), req.data, headers)
   1182         except socket.error, err: # XXX what error?
   1183             h.close()
   1184             raise URLError(err)
   1185         else:
   1186             try:
   1187                 r = h.getresponse(buffering=True)
   1188             except TypeError: # buffering kw not supported
   1189                 r = h.getresponse()
   1190 
   1191         # Pick apart the HTTPResponse object to get the addinfourl
   1192         # object initialized properly.
   1193 
   1194         # Wrap the HTTPResponse object in socket's file object adapter
   1195         # for Windows.  That adapter calls recv(), so delegate recv()
   1196         # to read().  This weird wrapping allows the returned object to
   1197         # have readline() and readlines() methods.
   1198 
   1199         # XXX It might be better to extract the read buffering code
   1200         # out of socket._fileobject() and into a base class.
   1201 
   1202         r.recv = r.read
   1203         fp = socket._fileobject(r, close=True)
   1204 
   1205         resp = addinfourl(fp, r.msg, req.get_full_url())
   1206         resp.code = r.status
   1207         resp.msg = r.reason
   1208         return resp
   1209 
   1210 
   1211 class HTTPHandler(AbstractHTTPHandler):
   1212 
   1213     def http_open(self, req):
   1214         return self.do_open(httplib.HTTPConnection, req)
   1215 
   1216     http_request = AbstractHTTPHandler.do_request_
   1217 
   1218 if hasattr(httplib, 'HTTPS'):
   1219     class HTTPSHandler(AbstractHTTPHandler):
   1220 
   1221         def https_open(self, req):
   1222             return self.do_open(httplib.HTTPSConnection, req)
   1223 
   1224         https_request = AbstractHTTPHandler.do_request_
   1225 
   1226 class HTTPCookieProcessor(BaseHandler):
   1227     def __init__(self, cookiejar=None):
   1228         import cookielib
   1229         if cookiejar is None:
   1230             cookiejar = cookielib.CookieJar()
   1231         self.cookiejar = cookiejar
   1232 
   1233     def http_request(self, request):
   1234         self.cookiejar.add_cookie_header(request)
   1235         return request
   1236 
   1237     def http_response(self, request, response):
   1238         self.cookiejar.extract_cookies(response, request)
   1239         return response
   1240 
   1241     https_request = http_request
   1242     https_response = http_response
   1243 
   1244 class UnknownHandler(BaseHandler):
   1245     def unknown_open(self, req):
   1246         type = req.get_type()
   1247         raise URLError('unknown url type: %s' % type)
   1248 
   1249 def parse_keqv_list(l):
   1250     """Parse list of key=value strings where keys are not duplicated."""
   1251     parsed = {}
   1252     for elt in l:
   1253         k, v = elt.split('=', 1)
   1254         if v[0] == '"' and v[-1] == '"':
   1255             v = v[1:-1]
   1256         parsed[k] = v
   1257     return parsed
   1258 
   1259 def parse_http_list(s):
   1260     """Parse lists as described by RFC 2068 Section 2.
   1261 
   1262     In particular, parse comma-separated lists where the elements of
   1263     the list may include quoted-strings.  A quoted-string could
   1264     contain a comma.  A non-quoted string could have quotes in the
   1265     middle.  Neither commas nor quotes count if they are escaped.
   1266     Only double-quotes count, not single-quotes.
   1267     """
   1268     res = []
   1269     part = ''
   1270 
   1271     escape = quote = False
   1272     for cur in s:
   1273         if escape:
   1274             part += cur
   1275             escape = False
   1276             continue
   1277         if quote:
   1278             if cur == '\\':
   1279                 escape = True
   1280                 continue
   1281             elif cur == '"':
   1282                 quote = False
   1283             part += cur
   1284             continue
   1285 
   1286         if cur == ',':
   1287             res.append(part)
   1288             part = ''
   1289             continue
   1290 
   1291         if cur == '"':
   1292             quote = True
   1293 
   1294         part += cur
   1295 
   1296     # append last part
   1297     if part:
   1298         res.append(part)
   1299 
   1300     return [part.strip() for part in res]
   1301 
   1302 def _safe_gethostbyname(host):
   1303     try:
   1304         return socket.gethostbyname(host)
   1305     except socket.gaierror:
   1306         return None
   1307 
   1308 class FileHandler(BaseHandler):
   1309     # Use local file or FTP depending on form of URL
   1310     def file_open(self, req):
   1311         url = req.get_selector()
   1312         if url[:2] == '//' and url[2:3] != '/' and (req.host and
   1313                 req.host != 'localhost'):
   1314             req.type = 'ftp'
   1315             return self.parent.open(req)
   1316         else:
   1317             return self.open_local_file(req)
   1318 
   1319     # names for the localhost
   1320     names = None
   1321     def get_names(self):
   1322         if FileHandler.names is None:
   1323             try:
   1324                 FileHandler.names = tuple(
   1325                     socket.gethostbyname_ex('localhost')[2] +
   1326                     socket.gethostbyname_ex(socket.gethostname())[2])
   1327             except socket.gaierror:
   1328                 FileHandler.names = (socket.gethostbyname('localhost'),)
   1329         return FileHandler.names
   1330 
   1331     # not entirely sure what the rules are here
   1332     def open_local_file(self, req):
   1333         import email.utils
   1334         import mimetypes
   1335         host = req.get_host()
   1336         filename = req.get_selector()
   1337         localfile = url2pathname(filename)
   1338         try:
   1339             stats = os.stat(localfile)
   1340             size = stats.st_size
   1341             modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
   1342             mtype = mimetypes.guess_type(filename)[0]
   1343             headers = mimetools.Message(StringIO(
   1344                 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
   1345                 (mtype or 'text/plain', size, modified)))
   1346             if host:
   1347                 host, port = splitport(host)
   1348             if not host or \
   1349                 (not port and _safe_gethostbyname(host) in self.get_names()):
   1350                 if host:
   1351                     origurl = 'file://' + host + filename
   1352                 else:
   1353                     origurl = 'file://' + filename
   1354                 return addinfourl(open(localfile, 'rb'), headers, origurl)
   1355         except OSError, msg:
   1356             # urllib2 users shouldn't expect OSErrors coming from urlopen()
   1357             raise URLError(msg)
   1358         raise URLError('file not on local host')
   1359 
   1360 class FTPHandler(BaseHandler):
   1361     def ftp_open(self, req):
   1362         import ftplib
   1363         import mimetypes
   1364         host = req.get_host()
   1365         if not host:
   1366             raise URLError('ftp error: no host given')
   1367         host, port = splitport(host)
   1368         if port is None:
   1369             port = ftplib.FTP_PORT
   1370         else:
   1371             port = int(port)
   1372 
   1373         # username/password handling
   1374         user, host = splituser(host)
   1375         if user:
   1376             user, passwd = splitpasswd(user)
   1377         else:
   1378             passwd = None
   1379         host = unquote(host)
   1380         user = user or ''
   1381         passwd = passwd or ''
   1382 
   1383         try:
   1384             host = socket.gethostbyname(host)
   1385         except socket.error, msg:
   1386             raise URLError(msg)
   1387         path, attrs = splitattr(req.get_selector())
   1388         dirs = path.split('/')
   1389         dirs = map(unquote, dirs)
   1390         dirs, file = dirs[:-1], dirs[-1]
   1391         if dirs and not dirs[0]:
   1392             dirs = dirs[1:]
   1393         try:
   1394             fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
   1395             type = file and 'I' or 'D'
   1396             for attr in attrs:
   1397                 attr, value = splitvalue(attr)
   1398                 if attr.lower() == 'type' and \
   1399                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
   1400                     type = value.upper()
   1401             fp, retrlen = fw.retrfile(file, type)
   1402             headers = ""
   1403             mtype = mimetypes.guess_type(req.get_full_url())[0]
   1404             if mtype:
   1405                 headers += "Content-type: %s\n" % mtype
   1406             if retrlen is not None and retrlen >= 0:
   1407                 headers += "Content-length: %d\n" % retrlen
   1408             sf = StringIO(headers)
   1409             headers = mimetools.Message(sf)
   1410             return addinfourl(fp, headers, req.get_full_url())
   1411         except ftplib.all_errors, msg:
   1412             raise URLError, ('ftp error: %s' % msg), sys.exc_info()[2]
   1413 
   1414     def connect_ftp(self, user, passwd, host, port, dirs, timeout):
   1415         fw = ftpwrapper(user, passwd, host, port, dirs, timeout,
   1416                         persistent=False)
   1417 ##        fw.ftp.set_debuglevel(1)
   1418         return fw
   1419 
   1420 class CacheFTPHandler(FTPHandler):
   1421     # XXX would be nice to have pluggable cache strategies
   1422     # XXX this stuff is definitely not thread safe
   1423     def __init__(self):
   1424         self.cache = {}
   1425         self.timeout = {}
   1426         self.soonest = 0
   1427         self.delay = 60
   1428         self.max_conns = 16
   1429 
   1430     def setTimeout(self, t):
   1431         self.delay = t
   1432 
   1433     def setMaxConns(self, m):
   1434         self.max_conns = m
   1435 
   1436     def connect_ftp(self, user, passwd, host, port, dirs, timeout):
   1437         key = user, host, port, '/'.join(dirs), timeout
   1438         if key in self.cache:
   1439             self.timeout[key] = time.time() + self.delay
   1440         else:
   1441             self.cache[key] = ftpwrapper(user, passwd, host, port, dirs, timeout)
   1442             self.timeout[key] = time.time() + self.delay
   1443         self.check_cache()
   1444         return self.cache[key]
   1445 
   1446     def check_cache(self):
   1447         # first check for old ones
   1448         t = time.time()
   1449         if self.soonest <= t:
   1450             for k, v in self.timeout.items():
   1451                 if v < t:
   1452                     self.cache[k].close()
   1453                     del self.cache[k]
   1454                     del self.timeout[k]
   1455         self.soonest = min(self.timeout.values())
   1456 
   1457         # then check the size
   1458         if len(self.cache) == self.max_conns:
   1459             for k, v in self.timeout.items():
   1460                 if v == self.soonest:
   1461                     del self.cache[k]
   1462                     del self.timeout[k]
   1463                     break
   1464             self.soonest = min(self.timeout.values())
   1465 
   1466     def clear_cache(self):
   1467         for conn in self.cache.values():
   1468             conn.close()
   1469         self.cache.clear()
   1470         self.timeout.clear()
   1471