      1 """An extensible library for opening URLs using a variety of protocols
      2 
      3 The simplest way to use this module is to call the urlopen function,
      4 which accepts a string containing a URL or a Request object (described
      5 below).  It opens the URL and returns the results as a file-like
      6 object; the returned object has some extra methods described below.
      7 
      8 The OpenerDirector manages a collection of Handler objects that do
      9 all the actual work.  Each Handler implements a particular protocol or
     10 option.  The OpenerDirector is a composite object that invokes the
     11 Handlers needed to open the requested URL.  For example, the
     12 HTTPHandler performs HTTP GET and POST requests and deals with
     13 non-error returns.  The HTTPRedirectHandler automatically deals with
     14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
     15 deals with digest authentication.
     16 
     17 urlopen(url, data=None) -- Basic usage is the same as original
     18 urllib.  Pass the URL and optionally the data to POST to an HTTP URL,
     19 and get a file-like object back.  One difference is that you can also
     20 pass a Request instance instead of a URL.  Raises a URLError (a
     21 subclass of IOError); for HTTP errors, raises an HTTPError, which can
     22 also be treated as a valid response.
     23 
     24 build_opener -- Function that creates a new OpenerDirector instance.
     25 Will install the default handlers.  Accepts one or more Handlers as
     26 arguments, either instances or Handler classes that it will
     27 instantiate.  If one of the arguments is a subclass of a default
     28 handler, that argument is installed instead of the default.
     29 
     30 install_opener -- Installs a new opener as the default opener.
     31 
     32 objects of interest:
     33 
     34 OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
     35 the Handler classes, while dealing with requests and responses.
     36 
     37 Request -- An object that encapsulates the state of a request.  The
     38 state can be as simple as the URL.  It can also include extra HTTP
     39 headers, e.g. a User-Agent.
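
For example (the URL and header value are illustrative):

    req = urllib2.Request('http://www.example.com/',
                          headers={'User-agent': 'MyApp/1.0'})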
     40 
     41 BaseHandler -- The base class from which all Handler classes derive.
     42 
     43 exceptions:
     44 URLError -- A subclass of IOError; individual protocols have their own
     45 specific subclasses.
     46 
     47 HTTPError -- Also a valid HTTP response, so you can treat an HTTP error
     48 as an exceptional event or valid response.
     49 
     50 internals:
     51 BaseHandler and parent
     52 _call_chain conventions
     53 
     54 Example usage:
     55 
     56 import urllib2
     57 
     58 # set up authentication info
     59 authinfo = urllib2.HTTPBasicAuthHandler()
     60 authinfo.add_password(realm='PDQ Application',
     61                       uri='https://mahler:8092/site-updates.py',
     62                       user='klem',
     63                       passwd='geheim$parole')
     64 
     65 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
     66 
     67 # build a new opener that adds authentication and caching FTP handlers
     68 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
     69 
     70 # install it
     71 urllib2.install_opener(opener)
     72 
     73 f = urllib2.urlopen('http://www.python.org/')
     74 
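A second, purely illustrative example (the URL is made up) showing that an
HTTPError can be handled either as an exception or read like a response:

try:
    f = urllib2.urlopen('http://www.python.org/does-not-exist')
except urllib2.HTTPError, e:
    print e.code, e.reason   # status information; e can also be read
except urllib2.URLError, e:
    print e.reason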
     75 
     76 """
     77 
     78 # XXX issues:
     79 # If an authentication error handler tries to perform
     80 # authentication for some reason but fails, how should the error be
     81 # signalled?  The client needs to know the HTTP error code.  But if
     82 # the handler knows what the problem was, e.g., that it didn't support
     83 # the hash algorithm requested in the challenge, it would be good to
     84 # pass that information along to the client, too.
     85 # ftp errors aren't handled cleanly
     86 # check digest against correct (i.e. non-apache) implementation
     87 
     88 # Possible extensions:
     89 # complex proxies  XXX not sure what exactly was meant by this
     90 # abstract factory for opener
     91 
     92 import base64
     93 import hashlib
     94 import httplib
     95 import mimetools
     96 import os
     97 import posixpath
     98 import random
     99 import re
    100 import socket
    101 import sys
    102 import time
    103 import urlparse
    104 import bisect
    105 import warnings
    106 
    107 try:
    108     from cStringIO import StringIO
    109 except ImportError:
    110     from StringIO import StringIO
    111 
    112 # check for SSL
    113 try:
    114     import ssl
    115 except ImportError:
    116     _have_ssl = False
    117 else:
    118     _have_ssl = True
    119 
    120 from urllib import (unwrap, unquote, splittype, splithost, quote,
    121      addinfourl, splitport, splittag, toBytes,
    122      splitattr, ftpwrapper, splituser, splitpasswd, splitvalue)
    123 
    124 # support for FileHandler, proxies via environment variables
    125 from urllib import localhost, url2pathname, getproxies, proxy_bypass
    126 
    127 # used in User-Agent header sent
    128 __version__ = sys.version[:3]
    129 
    130 _opener = None
    131 def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
    132             cafile=None, capath=None, cadefault=False, context=None):
    133     global _opener
    134     if cafile or capath or cadefault:
    135         if context is not None:
    136             raise ValueError(
    137                 "You can't pass both context and any of cafile, capath, and "
    138                 "cadefault"
    139             )
    140         if not _have_ssl:
    141             raise ValueError('SSL support not available')
    142         context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH,
    143                                              cafile=cafile,
    144                                              capath=capath)
    145         https_handler = HTTPSHandler(context=context)
    146         opener = build_opener(https_handler)
    147     elif context:
    148         https_handler = HTTPSHandler(context=context)
    149         opener = build_opener(https_handler)
    150     elif _opener is None:
    151         _opener = opener = build_opener()
    152     else:
    153         opener = _opener
    154     return opener.open(url, data, timeout)
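
# Illustrative call (the URL and path are made up):
#     urlopen('https://example.com/', cafile='/path/to/ca-bundle.pem')
# builds a one-off opener whose HTTPSHandler verifies server certificates
# against the given CA bundle, as the cafile/capath branch above shows.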
    155 
    156 def install_opener(opener):
    157     global _opener
    158     _opener = opener
    159 
    160 # do these error classes make sense?
    161 # make sure all of the IOError stuff is overridden.  we just want to be
    162 # subtypes.
    163 
    164 class URLError(IOError):
    165     # URLError is a sub-type of IOError, but it doesn't share any of
    166     # the implementation.  need to override __init__ and __str__.
    167     # It sets self.args for compatibility with other EnvironmentError
    168     # subclasses, but args doesn't have the typical format with errno in
    169     # slot 0 and strerror in slot 1.  This may be better than nothing.
    170     def __init__(self, reason):
    171         self.args = reason,
    172         self.reason = reason
    173 
    174     def __str__(self):
    175         return '<urlopen error %s>' % self.reason
    176 
    177 class HTTPError(URLError, addinfourl):
    178     """Raised when HTTP error occurs, but also acts like non-error return"""
    179     __super_init = addinfourl.__init__
    180 
    181     def __init__(self, url, code, msg, hdrs, fp):
    182         self.code = code
    183         self.msg = msg
    184         self.hdrs = hdrs
    185         self.fp = fp
    186         self.filename = url
    187         # The addinfourl classes depend on fp being a valid file
    188         # object.  In some cases, the HTTPError may not have a valid
    189         # file object.  If this happens, the simplest workaround is to
    190         # not initialize the base classes.
    191         if fp is not None:
    192             self.__super_init(fp, hdrs, url, code)
    193 
    194     def __str__(self):
    195         return 'HTTP Error %s: %s' % (self.code, self.msg)
    196 
    197     # since URLError specifies a .reason attribute, HTTPError should also
    198     #  provide this attribute.  See issue13211 for discussion.
    199     @property
    200     def reason(self):
    201         return self.msg
    202 
    203     def info(self):
    204         return self.hdrs
    205 
    206 # copied from cookielib.py
    207 _cut_port_re = re.compile(r":\d+$")
    208 def request_host(request):
    209     """Return request-host, as defined by RFC 2965.
    210 
    211     Variation from RFC: returned value is lowercased, for convenient
    212     comparison.
    213 
    214     """
    215     url = request.get_full_url()
    216     host = urlparse.urlparse(url)[1]
    217     if host == "":
    218         host = request.get_header("Host", "")
    219 
    220     # remove port, if present
    221     host = _cut_port_re.sub("", host, 1)
    222     return host.lower()
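
# For illustration: request_host(Request("http://www.example.com:8080/x"))
# returns "www.example.com" -- the port is stripped and the name lowercased.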
    223 
    224 class Request:
    225 
    226     def __init__(self, url, data=None, headers={},
    227                  origin_req_host=None, unverifiable=False):
    228         # unwrap('<URL:type://host/path>') --> 'type://host/path'
    229         self.__original = unwrap(url)
    230         self.__original, self.__fragment = splittag(self.__original)
    231         self.type = None
    232         # self.__r_type is what's left after doing the splittype
    233         self.host = None
    234         self.port = None
    235         self._tunnel_host = None
    236         self.data = data
    237         self.headers = {}
    238         for key, value in headers.items():
    239             self.add_header(key, value)
    240         self.unredirected_hdrs = {}
    241         if origin_req_host is None:
    242             origin_req_host = request_host(self)
    243         self.origin_req_host = origin_req_host
    244         self.unverifiable = unverifiable
    245 
    246     def __getattr__(self, attr):
    247         # XXX this is a fallback mechanism to guard against these
    248         # methods getting called in a non-standard order.  this may be
    249         # too complicated and/or unnecessary.
    250         # XXX should the __r_XXX attributes be public?
    251         if attr in ('_Request__r_type', '_Request__r_host'):
    252             getattr(self, 'get_' + attr[12:])()
    253             return self.__dict__[attr]
    254         raise AttributeError, attr
    255 
    256     def get_method(self):
    257         if self.has_data():
    258             return "POST"
    259         else:
    260             return "GET"
    261 
    262     # XXX these helper methods are lame
    263 
    264     def add_data(self, data):
    265         self.data = data
    266 
    267     def has_data(self):
    268         return self.data is not None
    269 
    270     def get_data(self):
    271         return self.data
    272 
    273     def get_full_url(self):
    274         if self.__fragment:
    275             return '%s#%s' % (self.__original, self.__fragment)
    276         else:
    277             return self.__original
    278 
    279     def get_type(self):
    280         if self.type is None:
    281             self.type, self.__r_type = splittype(self.__original)
    282             if self.type is None:
    283                 raise ValueError, "unknown url type: %s" % self.__original
    284         return self.type
    285 
    286     def get_host(self):
    287         if self.host is None:
    288             self.host, self.__r_host = splithost(self.__r_type)
    289             if self.host:
    290                 self.host = unquote(self.host)
    291         return self.host
    292 
    293     def get_selector(self):
    294         return self.__r_host
    295 
    296     def set_proxy(self, host, type):
    297         if self.type == 'https' and not self._tunnel_host:
    298             self._tunnel_host = self.host
    299         else:
    300             self.type = type
    301             self.__r_host = self.__original
    302 
    303         self.host = host
    304 
    305     def has_proxy(self):
    306         return self.__r_host == self.__original
    307 
    308     def get_origin_req_host(self):
    309         return self.origin_req_host
    310 
    311     def is_unverifiable(self):
    312         return self.unverifiable
    313 
    314     def add_header(self, key, val):
    315         # useful for something like authentication
    316         self.headers[key.capitalize()] = val
    317 
    318     def add_unredirected_header(self, key, val):
    319         # will not be added to a redirected request
    320         self.unredirected_hdrs[key.capitalize()] = val
    321 
    322     def has_header(self, header_name):
    323         return (header_name in self.headers or
    324                 header_name in self.unredirected_hdrs)
    325 
    326     def get_header(self, header_name, default=None):
    327         return self.headers.get(
    328             header_name,
    329             self.unredirected_hdrs.get(header_name, default))
    330 
    331     def header_items(self):
    332         hdrs = self.unredirected_hdrs.copy()
    333         hdrs.update(self.headers)
    334         return hdrs.items()
    335 
    336 class OpenerDirector:
    337     def __init__(self):
    338         client_version = "Python-urllib/%s" % __version__
    339         self.addheaders = [('User-agent', client_version)]
    340         # self.handlers is retained only for backward compatibility
    341         self.handlers = []
    342         # manage the individual handlers
    343         self.handle_open = {}
    344         self.handle_error = {}
    345         self.process_response = {}
    346         self.process_request = {}
    347 
    348     def add_handler(self, handler):
    349         if not hasattr(handler, "add_parent"):
    350             raise TypeError("expected BaseHandler instance, got %r" %
    351                             type(handler))
    352 
    353         added = False
    354         for meth in dir(handler):
    355             if meth in ["redirect_request", "do_open", "proxy_open"]:
    356                 # oops, coincidental match
    357                 continue
    358 
    359             i = meth.find("_")
    360             protocol = meth[:i]
    361             condition = meth[i+1:]
    362 
    363             if condition.startswith("error"):
    364                 j = condition.find("_") + i + 1
    365                 kind = meth[j+1:]
    366                 try:
    367                     kind = int(kind)
    368                 except ValueError:
    369                     pass
    370                 lookup = self.handle_error.get(protocol, {})
    371                 self.handle_error[protocol] = lookup
    372             elif condition == "open":
    373                 kind = protocol
    374                 lookup = self.handle_open
    375             elif condition == "response":
    376                 kind = protocol
    377                 lookup = self.process_response
    378             elif condition == "request":
    379                 kind = protocol
    380                 lookup = self.process_request
    381             else:
    382                 continue
    383 
    384             handlers = lookup.setdefault(kind, [])
    385             if handlers:
    386                 bisect.insort(handlers, handler)
    387             else:
    388                 handlers.append(handler)
    389             added = True
    390 
    391         if added:
    392             bisect.insort(self.handlers, handler)
    393             handler.add_parent(self)
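
    # Illustration of the naming convention parsed above (the handler class
    # is hypothetical): a handler defining
    #
    #     class ExampleHandler(BaseHandler):
    #         def http_open(self, req): ...
    #         def http_error_404(self, req, fp, code, msg, hdrs): ...
    #         def http_request(self, req): ...
    #         def http_response(self, req, resp): ...
    #
    # is registered under handle_open['http'], handle_error['http'][404],
    # process_request['http'] and process_response['http'], respectively.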
    394 
    395     def close(self):
    396         # Only exists for backwards compatibility.
    397         pass
    398 
    399     def _call_chain(self, chain, kind, meth_name, *args):
    400         # Handlers raise an exception if no one else should try to handle
    401         # the request, or return None if they can't but another handler
    402         # could.  Otherwise, they return the response.
    403         handlers = chain.get(kind, ())
    404         for handler in handlers:
    405             func = getattr(handler, meth_name)
    406 
    407             result = func(*args)
    408             if result is not None:
    409                 return result
    410 
    411     def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    412         # accept a URL or a Request object
    413         if isinstance(fullurl, basestring):
    414             req = Request(fullurl, data)
    415         else:
    416             req = fullurl
    417             if data is not None:
    418                 req.add_data(data)
    419 
    420         req.timeout = timeout
    421         protocol = req.get_type()
    422 
    423         # pre-process request
    424         meth_name = protocol+"_request"
    425         for processor in self.process_request.get(protocol, []):
    426             meth = getattr(processor, meth_name)
    427             req = meth(req)
    428 
    429         response = self._open(req, data)
    430 
    431         # post-process response
    432         meth_name = protocol+"_response"
    433         for processor in self.process_response.get(protocol, []):
    434             meth = getattr(processor, meth_name)
    435             response = meth(req, response)
    436 
    437         return response
    438 
    439     def _open(self, req, data=None):
    440         result = self._call_chain(self.handle_open, 'default',
    441                                   'default_open', req)
    442         if result:
    443             return result
    444 
    445         protocol = req.get_type()
    446         result = self._call_chain(self.handle_open, protocol, protocol +
    447                                   '_open', req)
    448         if result:
    449             return result
    450 
    451         return self._call_chain(self.handle_open, 'unknown',
    452                                 'unknown_open', req)
    453 
    454     def error(self, proto, *args):
    455         if proto in ('http', 'https'):
    456             # XXX http[s] protocols are special-cased
    457             dict = self.handle_error['http'] # https is no different from http here
    458             proto = args[2]  # YUCK!
    459             meth_name = 'http_error_%s' % proto
    460             http_err = 1
    461             orig_args = args
    462         else:
    463             dict = self.handle_error
    464             meth_name = proto + '_error'
    465             http_err = 0
    466         args = (dict, proto, meth_name) + args
    467         result = self._call_chain(*args)
    468         if result:
    469             return result
    470 
    471         if http_err:
    472             args = (dict, 'default', 'http_error_default') + orig_args
    473             return self._call_chain(*args)
    474 
    475 # XXX probably also want an abstract factory that knows when it makes
    476 # sense to skip a superclass in favor of a subclass and when it might
    477 # make sense to include both
    478 
    479 def build_opener(*handlers):
    480     """Create an opener object from a list of handlers.
    481 
    482     The opener will use several default handlers, including support
    483     for HTTP, FTP and when applicable, HTTPS.
    484 
    485     If any of the handlers passed as arguments are subclasses of the
    486     default handlers, the default handlers will not be used.
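
    For example (a minimal sketch; the handler choices are illustrative):

        build_opener(ProxyHandler({}), HTTPSHandler(debuglevel=1))

    returns an opener with proxy detection disabled and HTTPS debug output
    enabled, replacing the stock ProxyHandler and HTTPSHandler.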
    487     """
    488     import types
    489     def isclass(obj):
    490         return isinstance(obj, (types.ClassType, type))
    491 
    492     opener = OpenerDirector()
    493     default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
    494                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
    495                        FTPHandler, FileHandler, HTTPErrorProcessor]
    496     if hasattr(httplib, 'HTTPS'):
    497         default_classes.append(HTTPSHandler)
    498     skip = set()
    499     for klass in default_classes:
    500         for check in handlers:
    501             if isclass(check):
    502                 if issubclass(check, klass):
    503                     skip.add(klass)
    504             elif isinstance(check, klass):
    505                 skip.add(klass)
    506     for klass in skip:
    507         default_classes.remove(klass)
    508 
    509     for klass in default_classes:
    510         opener.add_handler(klass())
    511 
    512     for h in handlers:
    513         if isclass(h):
    514             h = h()
    515         opener.add_handler(h)
    516     return opener
    517 
    518 class BaseHandler:
    519     handler_order = 500
    520 
    521     def add_parent(self, parent):
    522         self.parent = parent
    523 
    524     def close(self):
    525         # Only exists for backwards compatibility
    526         pass
    527 
    528     def __lt__(self, other):
    529         if not hasattr(other, "handler_order"):
    530             # Try to preserve the old behavior of having custom classes
    531             # inserted after default ones (works only for custom user
    532             # classes which are not aware of handler_order).
    533             return True
    534         return self.handler_order < other.handler_order
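
    # Note: handler_order determines a handler's position in the chains built
    # by OpenerDirector.add_handler.  For reference, this module uses 100 for
    # ProxyHandler, 490 for the digest auth handlers, 500 as the default
    # above and 1000 for HTTPErrorProcessor.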
    535 
    536 
    537 class HTTPErrorProcessor(BaseHandler):
    538     """Process HTTP error responses."""
    539     handler_order = 1000  # after all other processing
    540 
    541     def http_response(self, request, response):
    542         code, msg, hdrs = response.code, response.msg, response.info()
    543 
    544         # According to RFC 2616, "2xx" code indicates that the client's
    545         # request was successfully received, understood, and accepted.
    546         if not (200 <= code < 300):
    547             response = self.parent.error(
    548                 'http', request, response, code, msg, hdrs)
    549 
    550         return response
    551 
    552     https_response = http_response
    553 
    554 class HTTPDefaultErrorHandler(BaseHandler):
    555     def http_error_default(self, req, fp, code, msg, hdrs):
    556         raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
    557 
    558 class HTTPRedirectHandler(BaseHandler):
    559     # maximum number of redirections to any single URL
    560     # this is needed because of the state that cookies introduce
    561     max_repeats = 4
    562     # maximum total number of redirections (regardless of URL) before
    563     # assuming we're in a loop
    564     max_redirections = 10
    565 
    566     def redirect_request(self, req, fp, code, msg, headers, newurl):
    567         """Return a Request or None in response to a redirect.
    568 
    569         This is called by the http_error_30x methods when a
    570         redirection response is received.  If a redirection should
    571         take place, return a new Request to allow http_error_30x to
    572         perform the redirect.  Otherwise, raise HTTPError if no-one
    573         else should try to handle this url.  Return None if you can't
    574         but another Handler might.
    575         """
    576         m = req.get_method()
    577         if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
    578             or code in (301, 302, 303) and m == "POST"):
    579             # Strictly (according to RFC 2616), 301 or 302 in response
    580             # to a POST MUST NOT cause a redirection without confirmation
    581             # from the user (of urllib2, in this case).  In practice,
    582             # essentially all clients do redirect in this case, so we
    583             # do the same.
    584             # be lenient with URIs containing a space
    585             newurl = newurl.replace(' ', '%20')
    586             newheaders = dict((k,v) for k,v in req.headers.items()
    587                               if k.lower() not in ("content-length", "content-type")
    588                              )
    589             return Request(newurl,
    590                            headers=newheaders,
    591                            origin_req_host=req.get_origin_req_host(),
    592                            unverifiable=True)
    593         else:
    594             raise HTTPError(req.get_full_url(), code, msg, headers, fp)
    595 
    596     # Implementation note: To avoid the server sending us into an
    597     # infinite loop, the request object needs to track what URLs we
    598     # have already seen.  Do this by adding a handler-specific
    599     # attribute to the Request object.
    600     def http_error_302(self, req, fp, code, msg, headers):
    601         # Some servers (incorrectly) return multiple Location headers
    602         # (so probably same goes for URI).  Use first header.
    603         if 'location' in headers:
    604             newurl = headers.getheaders('location')[0]
    605         elif 'uri' in headers:
    606             newurl = headers.getheaders('uri')[0]
    607         else:
    608             return
    609 
    610         # fix a possible malformed URL
    611         urlparts = urlparse.urlparse(newurl)
    612         if not urlparts.path and urlparts.netloc:
    613             urlparts = list(urlparts)
    614             urlparts[2] = "/"
    615         newurl = urlparse.urlunparse(urlparts)
    616 
    617         newurl = urlparse.urljoin(req.get_full_url(), newurl)
    618 
    619         # For security reasons we do not allow redirects to protocols
    620         # other than HTTP, HTTPS or FTP.
    621         newurl_lower = newurl.lower()
    622         if not (newurl_lower.startswith('http://') or
    623                 newurl_lower.startswith('https://') or
    624                 newurl_lower.startswith('ftp://')):
    625             raise HTTPError(newurl, code,
    626                             msg + " - Redirection to url '%s' is not allowed" %
    627                             newurl,
    628                             headers, fp)
    629 
    630         # XXX Probably want to forget about the state of the current
    631         # request, although that might interact poorly with other
    632         # handlers that also use handler-specific request attributes
    633         new = self.redirect_request(req, fp, code, msg, headers, newurl)
    634         if new is None:
    635             return
    636 
    637         # loop detection
    638         # .redirect_dict has a key url if url was previously visited.
    639         if hasattr(req, 'redirect_dict'):
    640             visited = new.redirect_dict = req.redirect_dict
    641             if (visited.get(newurl, 0) >= self.max_repeats or
    642                 len(visited) >= self.max_redirections):
    643                 raise HTTPError(req.get_full_url(), code,
    644                                 self.inf_msg + msg, headers, fp)
    645         else:
    646             visited = new.redirect_dict = req.redirect_dict = {}
    647         visited[newurl] = visited.get(newurl, 0) + 1
    648 
    649         # Don't close the fp until we are sure that we won't use it
    650         # with HTTPError.
    651         fp.read()
    652         fp.close()
    653 
    654         return self.parent.open(new, timeout=req.timeout)
    655 
    656     http_error_301 = http_error_303 = http_error_307 = http_error_302
    657 
    658     inf_msg = "The HTTP server returned a redirect error that would " \
    659               "lead to an infinite loop.\n" \
    660               "The last 30x error message was:\n"
    661 
    662 
    663 def _parse_proxy(proxy):
    664     """Return (scheme, user, password, host/port) given a URL or an authority.
    665 
    666     If a URL is supplied, it must have an authority (host:port) component.
    667     According to RFC 3986, having an authority component means the URL must
    668     have two slashes after the scheme:
    669 
    670     >>> _parse_proxy('file:/ftp.example.com/')
    671     Traceback (most recent call last):
    672     ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
    673 
    674     The first three items of the returned tuple may be None.
    675 
    676     Examples of authority parsing:
    677 
    678     >>> _parse_proxy('proxy.example.com')
    679     (None, None, None, 'proxy.example.com')
    680     >>> _parse_proxy('proxy.example.com:3128')
    681     (None, None, None, 'proxy.example.com:3128')
    682 
    683     The authority component may optionally include userinfo (assumed to be
    684     username:password):
    685 
    686     >>> _parse_proxy('joe:password@proxy.example.com')
    687     (None, 'joe', 'password', 'proxy.example.com')
    688     >>> _parse_proxy('joe:password@proxy.example.com:3128')
    689     (None, 'joe', 'password', 'proxy.example.com:3128')
    690 
    691     Same examples, but with URLs instead:
    692 
    693     >>> _parse_proxy('http://proxy.example.com/')
    694     ('http', None, None, 'proxy.example.com')
    695     >>> _parse_proxy('http://proxy.example.com:3128/')
    696     ('http', None, None, 'proxy.example.com:3128')
    697     >>> _parse_proxy('http://joe:password@proxy.example.com/')
    698     ('http', 'joe', 'password', 'proxy.example.com')
    699     >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    700     ('http', 'joe', 'password', 'proxy.example.com:3128')
    701 
    702     Everything after the authority is ignored:
    703 
    704     >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    705     ('ftp', 'joe', 'password', 'proxy.example.com')
    706 
    707     Test for no trailing '/' case:
    708 
    709     >>> _parse_proxy('http://joe:password@proxy.example.com')
    710     ('http', 'joe', 'password', 'proxy.example.com')
    711 
    712     """
    713     scheme, r_scheme = splittype(proxy)
    714     if not r_scheme.startswith("/"):
    715         # authority
    716         scheme = None
    717         authority = proxy
    718     else:
    719         # URL
    720         if not r_scheme.startswith("//"):
    721             raise ValueError("proxy URL with no authority: %r" % proxy)
    722         # We have an authority, so for RFC 3986-compliant URLs (by ss 3.2.2
    723         # and 3.3.), path is empty or starts with '/'
    724         end = r_scheme.find("/", 2)
    725         if end == -1:
    726             end = None
    727         authority = r_scheme[2:end]
    728     userinfo, hostport = splituser(authority)
    729     if userinfo is not None:
    730         user, password = splitpasswd(userinfo)
    731     else:
    732         user = password = None
    733     return scheme, user, password, hostport
    734 
    735 class ProxyHandler(BaseHandler):
    736     # Proxies must be in front
    737     handler_order = 100
    738 
    739     def __init__(self, proxies=None):
    740         if proxies is None:
    741             proxies = getproxies()
    742         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
    743         self.proxies = proxies
    744         for type, url in proxies.items():
    745             setattr(self, '%s_open' % type,
    746                     lambda r, proxy=url, type=type, meth=self.proxy_open: \
    747                     meth(r, proxy, type))
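
        # E.g. a proxies mapping of {'http': 'http://ahad-haam:3128'} (the
        # value used in the module docstring) creates an http_open attribute
        # that calls proxy_open(req, 'http://ahad-haam:3128', 'http').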
    748 
    749     def proxy_open(self, req, proxy, type):
    750         orig_type = req.get_type()
    751         proxy_type, user, password, hostport = _parse_proxy(proxy)
    752 
    753         if proxy_type is None:
    754             proxy_type = orig_type
    755 
    756         if req.host and proxy_bypass(req.host):
    757             return None
    758 
    759         if user and password:
    760             user_pass = '%s:%s' % (unquote(user), unquote(password))
    761             creds = base64.b64encode(user_pass).strip()
    762             req.add_header('Proxy-authorization', 'Basic ' + creds)
    763         hostport = unquote(hostport)
    764         req.set_proxy(hostport, proxy_type)
    765 
    766         if orig_type == proxy_type or orig_type == 'https':
    767             # let other handlers take care of it
    768             return None
    769         else:
    770             # need to start over, because the other handlers don't
    771             # grok the proxy's URL type
    772             # e.g. if we have a constructor arg proxies like so:
    773             # {'http': 'ftp://proxy.example.com'}, we may end up turning
    774             # a request for http://acme.example.com/a into one for
    775             # ftp://proxy.example.com/a
    776             return self.parent.open(req, timeout=req.timeout)
    777 
    778 class HTTPPasswordMgr:
    779 
    780     def __init__(self):
    781         self.passwd = {}
    782 
    783     def add_password(self, realm, uri, user, passwd):
    784         # uri could be a single URI or a sequence
    785         if isinstance(uri, basestring):
    786             uri = [uri]
    787         if not realm in self.passwd:
    788             self.passwd[realm] = {}
    789         for default_port in True, False:
    790             reduced_uri = tuple(
    791                 [self.reduce_uri(u, default_port) for u in uri])
    792             self.passwd[realm][reduced_uri] = (user, passwd)
    793 
    794     def find_user_password(self, realm, authuri):
    795         domains = self.passwd.get(realm, {})
    796         for default_port in True, False:
    797             reduced_authuri = self.reduce_uri(authuri, default_port)
    798             for uris, authinfo in domains.iteritems():
    799                 for uri in uris:
    800                     if self.is_suburi(uri, reduced_authuri):
    801                         return authinfo
    802         return None, None
    803 
    804     def reduce_uri(self, uri, default_port=True):
    805         """Accept authority or URI and extract only the authority and path."""
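        # For illustration (hypothetical URL): with default_port=True,
        # reduce_uri("http://example.com/spam/doc.html") returns
        # ("example.com:80", "/spam/doc.html").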
    806         # note HTTP URLs do not have a userinfo component
    807         parts = urlparse.urlsplit(uri)
    808         if parts[1]:
    809             # URI
    810             scheme = parts[0]
    811             authority = parts[1]
    812             path = parts[2] or '/'
    813         else:
    814             # host or host:port
    815             scheme = None
    816             authority = uri
    817             path = '/'
    818         host, port = splitport(authority)
    819         if default_port and port is None and scheme is not None:
    820             dport = {"http": 80,
    821                      "https": 443,
    822                      }.get(scheme)
    823             if dport is not None:
    824                 authority = "%s:%d" % (host, dport)
    825         return authority, path
    826 
    827     def is_suburi(self, base, test):
    828         """Check if test is below base in a URI tree
    829 
    830         Both args must be URIs in reduced form.
    831         """
    832         if base == test:
    833             return True
    834         if base[0] != test[0]:
    835             return False
    836         common = posixpath.commonprefix((base[1], test[1]))
    837         if len(common) == len(base[1]):
    838             return True
    839         return False
    840 
    841 
    842 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    843 
    844     def find_user_password(self, realm, authuri):
    845         user, password = HTTPPasswordMgr.find_user_password(self, realm,
    846                                                             authuri)
    847         if user is not None:
    848             return user, password
    849         return HTTPPasswordMgr.find_user_password(self, None, authuri)
    850 
    851 
    852 class AbstractBasicAuthHandler:
    853 
    854     # XXX this allows for multiple auth-schemes, but will stupidly pick
    855     # the last one with a realm specified.
    856 
    857     # allow for double- and single-quoted realm values
    858     # (single quotes are a violation of the RFC, but appear in the wild)
    859     rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
    860                     'realm=(["\']?)([^"\']*)\\2', re.I)
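    # For illustration: matched against 'Basic realm="example"', the groups
    # are ('Basic', '"', 'example') -- scheme, quote character and realm.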
    861 
    862     # XXX could pre-emptively send auth info already accepted (RFC 2617,
    863     # end of section 2, and section 1.2 immediately after "credentials"
    864     # production).
    865 
    866     def __init__(self, password_mgr=None):
    867         if password_mgr is None:
    868             password_mgr = HTTPPasswordMgr()
    869         self.passwd = password_mgr
    870         self.add_password = self.passwd.add_password
    871 
    872 
    873     def http_error_auth_reqed(self, authreq, host, req, headers):
    874         # host may be an authority (without userinfo) or a URL with an
    875         # authority
    876         # XXX could be multiple headers
    877         authreq = headers.get(authreq, None)
    878 
    879         if authreq:
    880             mo = AbstractBasicAuthHandler.rx.search(authreq)
    881             if mo:
    882                 scheme, quote, realm = mo.groups()
    883                 if quote not in ['"', "'"]:
    884                     warnings.warn("Basic Auth Realm was unquoted",
    885                                   UserWarning, 2)
    886                 if scheme.lower() == 'basic':
    887                     return self.retry_http_basic_auth(host, req, realm)
    888 
    889     def retry_http_basic_auth(self, host, req, realm):
    890         user, pw = self.passwd.find_user_password(realm, host)
    891         if pw is not None:
    892             raw = "%s:%s" % (user, pw)
    893             auth = 'Basic %s' % base64.b64encode(raw).strip()
    894             if req.get_header(self.auth_header, None) == auth:
    895                 return None
    896             req.add_unredirected_header(self.auth_header, auth)
    897             return self.parent.open(req, timeout=req.timeout)
    898         else:
    899             return None
    900 
    901 
    902 class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    903 
    904     auth_header = 'Authorization'
    905 
    906     def http_error_401(self, req, fp, code, msg, headers):
    907         url = req.get_full_url()
    908         response = self.http_error_auth_reqed('www-authenticate',
    909                                               url, req, headers)
    910         return response
    911 
    912 
    913 class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    914 
    915     auth_header = 'Proxy-authorization'
    916 
    917     def http_error_407(self, req, fp, code, msg, headers):
    918         # http_error_auth_reqed requires that there is no userinfo component in
    919         # authority.  Assume there isn't one, since urllib2 does not (and
    920         # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
    921         # userinfo.
    922         authority = req.get_host()
    923         response = self.http_error_auth_reqed('proxy-authenticate',
    924                                           authority, req, headers)
    925         return response
    926 
    927 
    928 def randombytes(n):
    929     """Return n random bytes."""
    930     # Use /dev/urandom if it is available.  Fall back to random module
    931     # if not.  It might be worthwhile to extend this function to use
    932     # other platform-specific mechanisms for getting random bytes.
    933     if os.path.exists("/dev/urandom"):
    934         f = open("/dev/urandom")
    935         s = f.read(n)
    936         f.close()
    937         return s
    938     else:
    939         L = [chr(random.randrange(0, 256)) for i in range(n)]
    940         return "".join(L)
    941 
    942 class AbstractDigestAuthHandler:
    943     # Digest authentication is specified in RFC 2617.
    944 
    945     # XXX The client does not inspect the Authentication-Info header
    946     # in a successful response.
    947 
    948     # XXX It should be possible to test this implementation against
    949     # a mock server that just generates a static set of challenges.
    950 
    951     # XXX qop="auth-int" support is shaky
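
    # For reference, the code below computes (per RFC 2617, qop="auth"):
    #     A1       = user ":" realm ":" password
    #     A2       = method ":" request-uri
    #     response = H(H(A1) ":" nonce ":" nc ":" cnonce ":" qop ":" H(A2))
    # and, when the server offers no qop, H(H(A1) ":" nonce ":" H(A2)).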
    952 
    953     def __init__(self, passwd=None):
    954         if passwd is None:
    955             passwd = HTTPPasswordMgr()
    956         self.passwd = passwd
    957         self.add_password = self.passwd.add_password
    958         self.retried = 0
    959         self.nonce_count = 0
    960         self.last_nonce = None
    961 
    962     def reset_retry_count(self):
    963         self.retried = 0
    964 
    965     def http_error_auth_reqed(self, auth_header, host, req, headers):
    966         authreq = headers.get(auth_header, None)
    967         if self.retried > 5:
    968             # Don't fail endlessly - if we failed once, we'll probably
    969             # fail a second time. Hm. Unless the Password Manager is
    970             # prompting for the information. Crap. This isn't great
    971             # but it's better than the current 'repeat until recursion
    972             # depth exceeded' approach <wink>
    973             raise HTTPError(req.get_full_url(), 401, "digest auth failed",
    974                             headers, None)
    975         else:
    976             self.retried += 1
    977         if authreq:
    978             scheme = authreq.split()[0]
    979             if scheme.lower() == 'digest':
    980                 return self.retry_http_digest_auth(req, authreq)
    981 
    982     def retry_http_digest_auth(self, req, auth):
    983         token, challenge = auth.split(' ', 1)
    984         chal = parse_keqv_list(parse_http_list(challenge))
    985         auth = self.get_authorization(req, chal)
    986         if auth:
    987             auth_val = 'Digest %s' % auth
    988             if req.headers.get(self.auth_header, None) == auth_val:
    989                 return None
    990             req.add_unredirected_header(self.auth_header, auth_val)
    991             resp = self.parent.open(req, timeout=req.timeout)
    992             return resp
    993 
    994     def get_cnonce(self, nonce):
    995         # The cnonce-value is an opaque
    996         # quoted string value provided by the client and used by both client
    997         # and server to avoid chosen plaintext attacks, to provide mutual
    998         # authentication, and to provide some message integrity protection.
    999         # This isn't a fabulous effort, but it's probably Good Enough.
   1000         dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
   1001                                             randombytes(8))).hexdigest()
   1002         return dig[:16]
   1003 
   1004     def get_authorization(self, req, chal):
   1005         try:
   1006             realm = chal['realm']
   1007             nonce = chal['nonce']
   1008             qop = chal.get('qop')
   1009             algorithm = chal.get('algorithm', 'MD5')
   1010             # mod_digest doesn't send an opaque, even though it isn't
   1011             # supposed to be optional
   1012             opaque = chal.get('opaque', None)
   1013         except KeyError:
   1014             return None
   1015 
   1016         H, KD = self.get_algorithm_impls(algorithm)
   1017         if H is None:
   1018             return None
   1019 
   1020         user, pw = self.passwd.find_user_password(realm, req.get_full_url())
   1021         if user is None:
   1022             return None
   1023 
   1024         # XXX not implemented yet
   1025         if req.has_data():
   1026             entdig = self.get_entity_digest(req.get_data(), chal)
   1027         else:
   1028             entdig = None
   1029 
   1030         A1 = "%s:%s:%s" % (user, realm, pw)
   1031         A2 = "%s:%s" % (req.get_method(),
   1032                         # XXX selector: what about proxies and full urls
   1033                         req.get_selector())
   1034         if qop == 'auth':
   1035             if nonce == self.last_nonce:
   1036                 self.nonce_count += 1
   1037             else:
   1038                 self.nonce_count = 1
   1039                 self.last_nonce = nonce
   1040 
   1041             ncvalue = '%08x' % self.nonce_count
   1042             cnonce = self.get_cnonce(nonce)
   1043             noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
   1044             respdig = KD(H(A1), noncebit)
   1045         elif qop is None:
   1046             respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
   1047         else:
   1048             # XXX handle auth-int.
   1049             raise URLError("qop '%s' is not supported." % qop)
   1050 
   1051         # XXX should the partial digests be encoded too?
   1052 
   1053         base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
   1054                'response="%s"' % (user, realm, nonce, req.get_selector(),
   1055                                   respdig)
   1056         if opaque:
   1057             base += ', opaque="%s"' % opaque
   1058         if entdig:
   1059             base += ', digest="%s"' % entdig
   1060         base += ', algorithm="%s"' % algorithm
   1061         if qop:
   1062             base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
   1063         return base
   1064 
   1065     def get_algorithm_impls(self, algorithm):
   1066         # algorithm should be case-insensitive according to RFC2617
   1067         algorithm = algorithm.upper()
   1068         # lambdas assume digest modules are imported at the top level
   1069         if algorithm == 'MD5':
   1070             H = lambda x: hashlib.md5(x).hexdigest()
   1071         elif algorithm == 'SHA':
   1072             H = lambda x: hashlib.sha1(x).hexdigest()
   1073         # XXX MD5-sess
   1074         else:
   1075             raise ValueError("Unsupported digest authentication "
   1076                              "algorithm %r" % algorithm.lower())
   1077         KD = lambda s, d: H("%s:%s" % (s, d))
   1078         return H, KD
   1079 
   1080     def get_entity_digest(self, data, chal):
   1081         # XXX not implemented yet
   1082         return None
   1083 
   1084 
   1085 class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
   1086     """An authentication protocol defined by RFC 2069 (updated by RFC 2617)
   1087 
   1088     Digest authentication improves on basic authentication because it
   1089     does not transmit passwords in the clear.
   1090     """
   1091 
   1092     auth_header = 'Authorization'
   1093     handler_order = 490  # before Basic auth
   1094 
   1095     def http_error_401(self, req, fp, code, msg, headers):
   1096         host = urlparse.urlparse(req.get_full_url())[1]
   1097         retry = self.http_error_auth_reqed('www-authenticate',
   1098                                            host, req, headers)
   1099         self.reset_retry_count()
   1100         return retry
   1101 
   1102 
   1103 class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
   1104 
   1105     auth_header = 'Proxy-Authorization'
   1106     handler_order = 490  # before Basic auth
   1107 
   1108     def http_error_407(self, req, fp, code, msg, headers):
   1109         host = req.get_host()
   1110         retry = self.http_error_auth_reqed('proxy-authenticate',
   1111                                            host, req, headers)
   1112         self.reset_retry_count()
   1113         return retry
   1114 
   1115 class AbstractHTTPHandler(BaseHandler):
   1116 
   1117     def __init__(self, debuglevel=0):
   1118         self._debuglevel = debuglevel
   1119 
   1120     def set_http_debuglevel(self, level):
   1121         self._debuglevel = level
   1122 
   1123     def do_request_(self, request):
   1124         host = request.get_host()
   1125         if not host:
   1126             raise URLError('no host given')
   1127 
   1128         if request.has_data():  # POST
   1129             data = request.get_data()
   1130             if not request.has_header('Content-type'):
   1131                 request.add_unredirected_header(
   1132                     'Content-type',
   1133                     'application/x-www-form-urlencoded')
   1134             if not request.has_header('Content-length'):
   1135                 request.add_unredirected_header(
   1136                     'Content-length', '%d' % len(data))
   1137 
   1138         sel_host = host
   1139         if request.has_proxy():
   1140             scheme, sel = splittype(request.get_selector())
   1141             sel_host, sel_path = splithost(sel)
   1142 
   1143         if not request.has_header('Host'):
   1144             request.add_unredirected_header('Host', sel_host)
   1145         for name, value in self.parent.addheaders:
   1146             name = name.capitalize()
   1147             if not request.has_header(name):
   1148                 request.add_unredirected_header(name, value)
   1149 
   1150         return request
   1151 
   1152     def do_open(self, http_class, req, **http_conn_args):
   1153         """Return an addinfourl object for the request, using http_class.
   1154 
   1155         http_class must implement the HTTPConnection API from httplib.
   1156         The addinfourl return value is a file-like object.  It also
   1157         has methods and attributes including:
   1158             - info(): return a mimetools.Message object for the headers
   1159             - geturl(): return the original request URL
   1160             - code: HTTP status code
   1161         """
   1162         host = req.get_host()
   1163         if not host:
   1164             raise URLError('no host given')
   1165 
   1166         # will parse host:port
   1167         h = http_class(host, timeout=req.timeout, **http_conn_args)
   1168         h.set_debuglevel(self._debuglevel)
   1169 
   1170         headers = dict(req.unredirected_hdrs)
   1171         headers.update(dict((k, v) for k, v in req.headers.items()
   1172                             if k not in headers))
   1173 
   1174         # We want to make an HTTP/1.1 request, but the addinfourl
   1175         # class isn't prepared to deal with a persistent connection.
   1176         # It will try to read all remaining data from the socket,
   1177         # which will block while the server waits for the next request.
   1178         # So make sure the connection gets closed after the (only)
   1179         # request.
   1180         headers["Connection"] = "close"
   1181         headers = dict(
   1182             (name.title(), val) for name, val in headers.items())
   1183 
   1184         if req._tunnel_host:
   1185             tunnel_headers = {}
   1186             proxy_auth_hdr = "Proxy-Authorization"
   1187             if proxy_auth_hdr in headers:
   1188                 tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
   1189                 # Proxy-Authorization should not be sent to origin
   1190                 # server.
   1191                 del headers[proxy_auth_hdr]
   1192             h.set_tunnel(req._tunnel_host, headers=tunnel_headers)
   1193 
   1194         try:
   1195             h.request(req.get_method(), req.get_selector(), req.data, headers)
   1196         except socket.error, err: # XXX what error?
   1197             h.close()
   1198             raise URLError(err)
   1199         else:
   1200             try:
   1201                 r = h.getresponse(buffering=True)
   1202             except TypeError: # buffering kw not supported
   1203                 r = h.getresponse()
   1204 
   1205         # Pick apart the HTTPResponse object to get the addinfourl
   1206         # object initialized properly.
   1207 
   1208         # Wrap the HTTPResponse object in socket's file object adapter
   1209         # for Windows.  That adapter calls recv(), so delegate recv()
   1210         # to read().  This weird wrapping allows the returned object to
   1211         # have readline() and readlines() methods.
   1212 
   1213         # XXX It might be better to extract the read buffering code
   1214         # out of socket._fileobject() and into a base class.
   1215 
   1216         r.recv = r.read
   1217         fp = socket._fileobject(r, close=True)
   1218 
   1219         resp = addinfourl(fp, r.msg, req.get_full_url())
   1220         resp.code = r.status
   1221         resp.msg = r.reason
   1222         return resp
   1223 
   1224 
   1225 class HTTPHandler(AbstractHTTPHandler):
   1226 
   1227     def http_open(self, req):
   1228         return self.do_open(httplib.HTTPConnection, req)
   1229 
   1230     http_request = AbstractHTTPHandler.do_request_
   1231 
   1232 if hasattr(httplib, 'HTTPS'):
   1233     class HTTPSHandler(AbstractHTTPHandler):
   1234 
   1235         def __init__(self, debuglevel=0, context=None):
   1236             AbstractHTTPHandler.__init__(self, debuglevel)
   1237             self._context = context
   1238 
   1239         def https_open(self, req):
   1240             return self.do_open(httplib.HTTPSConnection, req,
   1241                 context=self._context)
   1242 
   1243         https_request = AbstractHTTPHandler.do_request_
   1244 
   1245 class HTTPCookieProcessor(BaseHandler):
   1246     def __init__(self, cookiejar=None):
   1247         import cookielib
   1248         if cookiejar is None:
   1249             cookiejar = cookielib.CookieJar()
   1250         self.cookiejar = cookiejar
   1251 
   1252     def http_request(self, request):
   1253         self.cookiejar.add_cookie_header(request)
   1254         return request
   1255 
   1256     def http_response(self, request, response):
   1257         self.cookiejar.extract_cookies(response, request)
   1258         return response
   1259 
   1260     https_request = http_request
   1261     https_response = http_response
   1262 
   1263 class UnknownHandler(BaseHandler):
   1264     def unknown_open(self, req):
   1265         type = req.get_type()
   1266         raise URLError('unknown url type: %s' % type)
   1267 
   1268 def parse_keqv_list(l):
   1269     """Parse list of key=value strings where keys are not duplicated."""
   1270     parsed = {}
   1271     for elt in l:
   1272         k, v = elt.split('=', 1)
   1273         if v[0] == '"' and v[-1] == '"':
   1274             v = v[1:-1]
   1275         parsed[k] = v
   1276     return parsed
   1277 
   1278 def parse_http_list(s):
   1279     """Parse lists as described by RFC 2068 Section 2.
   1280 
   1281     In particular, parse comma-separated lists where the elements of
   1282     the list may include quoted-strings.  A quoted-string could
   1283     contain a comma.  A non-quoted string could have quotes in the
   1284     middle.  Neither commas nor quotes count if they are escaped.
   1285     Only double-quotes count, not single-quotes.
   1286     """
   1287     res = []
   1288     part = ''
   1289 
   1290     escape = quote = False
   1291     for cur in s:
   1292         if escape:
   1293             part += cur
   1294             escape = False
   1295             continue
   1296         if quote:
   1297             if cur == '\\':
   1298                 escape = True
   1299                 continue
   1300             elif cur == '"':
   1301                 quote = False
   1302             part += cur
   1303             continue
   1304 
   1305         if cur == ',':
   1306             res.append(part)
   1307             part = ''
   1308             continue
   1309 
   1310         if cur == '"':
   1311             quote = True
   1312 
   1313         part += cur
   1314 
   1315     # append last part
   1316     if part:
   1317         res.append(part)
   1318 
   1319     return [part.strip() for part in res]
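
# Illustrative behaviour of parse_http_list: commas inside double-quoted
# strings do not split the list, and the quotes are left in place for the
# caller (parse_keqv_list above strips them):
#
#   parse_http_list('a, "b,c", d')
#   returns ['a', '"b,c"', 'd']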
   1320 
   1321 def _safe_gethostbyname(host):
   1322     try:
   1323         return socket.gethostbyname(host)
   1324     except socket.gaierror:
   1325         return None
   1326 
   1327 class FileHandler(BaseHandler):
   1328     # Use local file or FTP depending on form of URL
   1329     def file_open(self, req):
   1330         url = req.get_selector()
   1331         if url[:2] == '//' and url[2:3] != '/' and (req.host and
   1332                 req.host != 'localhost'):
   1333             req.type = 'ftp'
   1334             return self.parent.open(req)
   1335         else:
   1336             return self.open_local_file(req)
   1337 
   1338     # names for the localhost
   1339     names = None
   1340     def get_names(self):
   1341         if FileHandler.names is None:
   1342             try:
   1343                 FileHandler.names = tuple(
   1344                     socket.gethostbyname_ex('localhost')[2] +
   1345                     socket.gethostbyname_ex(socket.gethostname())[2])
   1346             except socket.gaierror:
   1347                 FileHandler.names = (socket.gethostbyname('localhost'),)
   1348         return FileHandler.names
   1349 
    1350     # not entirely sure what the rules are here: serve the file only when the URL names no host, or a local host with no explicit port
   1351     def open_local_file(self, req):
   1352         import email.utils
   1353         import mimetypes
   1354         host = req.get_host()
   1355         filename = req.get_selector()
   1356         localfile = url2pathname(filename)
   1357         try:
   1358             stats = os.stat(localfile)
   1359             size = stats.st_size
   1360             modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
   1361             mtype = mimetypes.guess_type(filename)[0]
   1362             headers = mimetools.Message(StringIO(
   1363                 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
   1364                 (mtype or 'text/plain', size, modified)))
   1365             if host:
   1366                 host, port = splitport(host)
   1367             if not host or \
   1368                 (not port and _safe_gethostbyname(host) in self.get_names()):
   1369                 if host:
   1370                     origurl = 'file://' + host + filename
   1371                 else:
   1372                     origurl = 'file://' + filename
   1373                 return addinfourl(open(localfile, 'rb'), headers, origurl)
   1374         except OSError, msg:
   1375             # urllib2 users shouldn't expect OSErrors coming from urlopen()
   1376             raise URLError(msg)
   1377         raise URLError('file not on local host')
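
# A minimal usage sketch for FileHandler via urlopen (illustrative; the path
# is a placeholder and must name an existing local file):
#
#   import urllib2
#   f = urllib2.urlopen('file:///tmp/example.txt')
#   print f.info().getheader('Content-length')
#   data = f.read()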
   1378 
   1379 class FTPHandler(BaseHandler):
   1380     def ftp_open(self, req):
   1381         import ftplib
   1382         import mimetypes
   1383         host = req.get_host()
   1384         if not host:
   1385             raise URLError('ftp error: no host given')
   1386         host, port = splitport(host)
   1387         if port is None:
   1388             port = ftplib.FTP_PORT
   1389         else:
   1390             port = int(port)
   1391 
   1392         # username/password handling
   1393         user, host = splituser(host)
   1394         if user:
   1395             user, passwd = splitpasswd(user)
   1396         else:
   1397             passwd = None
   1398         host = unquote(host)
   1399         user = user or ''
   1400         passwd = passwd or ''
   1401 
   1402         try:
   1403             host = socket.gethostbyname(host)
   1404         except socket.error, msg:
   1405             raise URLError(msg)
   1406         path, attrs = splitattr(req.get_selector())
   1407         dirs = path.split('/')
   1408         dirs = map(unquote, dirs)
   1409         dirs, file = dirs[:-1], dirs[-1]
   1410         if dirs and not dirs[0]:
   1411             dirs = dirs[1:]
   1412         try:
   1413             fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
   1414             type = file and 'I' or 'D'
   1415             for attr in attrs:
   1416                 attr, value = splitvalue(attr)
   1417                 if attr.lower() == 'type' and \
   1418                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
   1419                     type = value.upper()
   1420             fp, retrlen = fw.retrfile(file, type)
   1421             headers = ""
   1422             mtype = mimetypes.guess_type(req.get_full_url())[0]
   1423             if mtype:
   1424                 headers += "Content-type: %s\n" % mtype
   1425             if retrlen is not None and retrlen >= 0:
   1426                 headers += "Content-length: %d\n" % retrlen
   1427             sf = StringIO(headers)
   1428             headers = mimetools.Message(sf)
   1429             return addinfourl(fp, headers, req.get_full_url())
   1430         except ftplib.all_errors, msg:
   1431             raise URLError, ('ftp error: %s' % msg), sys.exc_info()[2]
   1432 
   1433     def connect_ftp(self, user, passwd, host, port, dirs, timeout):
   1434         fw = ftpwrapper(user, passwd, host, port, dirs, timeout,
   1435                         persistent=False)
   1436 ##        fw.ftp.set_debuglevel(1)
   1437         return fw
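
# A minimal usage sketch for FTPHandler via urlopen (illustrative; the host
# and path are placeholders).  The ";type=a" attribute selects an ASCII
# transfer, as handled by the attrs loop in ftp_open above:
#
#   import urllib2
#   f = urllib2.urlopen('ftp://ftp.example.com/pub/README;type=a')
#   print f.info().getheader('Content-type')
#   data = f.read()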
   1438 
   1439 class CacheFTPHandler(FTPHandler):
   1440     # XXX would be nice to have pluggable cache strategies
   1441     # XXX this stuff is definitely not thread safe
   1442     def __init__(self):
   1443         self.cache = {}
   1444         self.timeout = {}
   1445         self.soonest = 0
   1446         self.delay = 60
   1447         self.max_conns = 16
   1448 
   1449     def setTimeout(self, t):
   1450         self.delay = t
   1451 
   1452     def setMaxConns(self, m):
   1453         self.max_conns = m
   1454 
   1455     def connect_ftp(self, user, passwd, host, port, dirs, timeout):
   1456         key = user, host, port, '/'.join(dirs), timeout
   1457         if key in self.cache:
   1458             self.timeout[key] = time.time() + self.delay
   1459         else:
   1460             self.cache[key] = ftpwrapper(user, passwd, host, port, dirs, timeout)
   1461             self.timeout[key] = time.time() + self.delay
   1462         self.check_cache()
   1463         return self.cache[key]
   1464 
   1465     def check_cache(self):
   1466         # first check for old ones
   1467         t = time.time()
   1468         if self.soonest <= t:
   1469             for k, v in self.timeout.items():
   1470                 if v < t:
   1471                     self.cache[k].close()
   1472                     del self.cache[k]
   1473                     del self.timeout[k]
   1474         self.soonest = min(self.timeout.values())
   1475 
   1476         # then check the size
   1477         if len(self.cache) == self.max_conns:
   1478             for k, v in self.timeout.items():
   1479                 if v == self.soonest:
   1480                     del self.cache[k]
   1481                     del self.timeout[k]
   1482                     break
   1483             self.soonest = min(self.timeout.values())
   1484 
   1485     def clear_cache(self):
   1486         for conn in self.cache.values():
   1487             conn.close()
   1488         self.cache.clear()
   1489         self.timeout.clear()
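
# A minimal tuning sketch for CacheFTPHandler (illustrative values):
# setTimeout controls how long an idle FTP connection stays cached, and
# setMaxConns caps the number of cached connections.
#
#   import urllib2
#   handler = urllib2.CacheFTPHandler()
#   handler.setTimeout(30)
#   handler.setMaxConns(4)
#   opener = urllib2.build_opener(handler)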
   1490