Home | History | Annotate | Download | only in fancy_urllib
      1 #!/usr/bin/env python
      2 #
      3 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007 Python Software
      4 # Foundation; All Rights Reserved
      5 
      6 """A HTTPSConnection/Handler with additional proxy and cert validation features.
      7 
      8 In particular, monkey patches in Python r74203 to provide support for CONNECT
      9 proxies and adds SSL cert validation if the ssl module is present.
     10 """
     11 
     12 __author__ = "{frew,nick.johnson}@google.com (Fred Wulff and Nick Johnson)"
     13 
     14 import base64
     15 import httplib
     16 import logging
     17 import re
     18 import socket
     19 import urllib2
     20 
     21 from urllib import splittype
     22 from urllib import splituser
     23 from urllib import splitpasswd
     24 
     25 class InvalidCertificateException(httplib.HTTPException):
     26   """Raised when a certificate is provided with an invalid hostname."""
     27 
     28   def __init__(self, host, cert, reason):
     29     """Constructor.
     30 
     31     Args:
     32       host: The hostname the connection was made to.
     33       cert: The SSL certificate (as a dictionary) the host returned.
     34     """
     35     httplib.HTTPException.__init__(self)
     36     self.host = host
     37     self.cert = cert
     38     self.reason = reason
     39 
     40   def __str__(self):
     41     return ('Host %s returned an invalid certificate (%s): %s\n'
     42             'To learn more, see '
     43             'http://code.google.com/appengine/kb/general.html#rpcssl' %
     44             (self.host, self.reason, self.cert))
     45 
     46 def can_validate_certs():
     47   """Return True if we have the SSL package and can validate certificates."""
     48   try:
     49     import ssl
     50     return True
     51   except ImportError:
     52     return False
     53 
     54 def _create_fancy_connection(tunnel_host=None, key_file=None,
     55                              cert_file=None, ca_certs=None):
     56   # This abomination brought to you by the fact that
     57   # the HTTPHandler creates the connection instance in the middle
     58   # of do_open so we need to add the tunnel host to the class.
     59 
     60   class PresetProxyHTTPSConnection(httplib.HTTPSConnection):
     61     """An HTTPS connection that uses a proxy defined by the enclosing scope."""
     62 
     63     def __init__(self, *args, **kwargs):
     64       httplib.HTTPSConnection.__init__(self, *args, **kwargs)
     65 
     66       self._tunnel_host = tunnel_host
     67       if tunnel_host:
     68         logging.debug("Creating preset proxy https conn: %s", tunnel_host)
     69 
     70       self.key_file = key_file
     71       self.cert_file = cert_file
     72       self.ca_certs = ca_certs
     73       try:
     74         import ssl
     75         if self.ca_certs:
     76           self.cert_reqs = ssl.CERT_REQUIRED
     77         else:
     78           self.cert_reqs = ssl.CERT_NONE
     79       except ImportError:
     80         pass
     81 
     82     def _tunnel(self):
     83       self._set_hostport(self._tunnel_host, None)
     84       logging.info("Connecting through tunnel to: %s:%d",
     85                    self.host, self.port)
     86       self.send("CONNECT %s:%d HTTP/1.0\r\n\r\n" % (self.host, self.port))
     87       response = self.response_class(self.sock, strict=self.strict,
     88                                      method=self._method)
     89       (_, code, message) = response._read_status()
     90 
     91       if code != 200:
     92         self.close()
     93         raise socket.error, "Tunnel connection failed: %d %s" % (
     94             code, message.strip())
     95 
     96       while True:
     97         line = response.fp.readline()
     98         if line == "\r\n":
     99           break
    100 
    101     def _get_valid_hosts_for_cert(self, cert):
    102       """Returns a list of valid host globs for an SSL certificate.
    103 
    104       Args:
    105         cert: A dictionary representing an SSL certificate.
    106       Returns:
    107         list: A list of valid host globs.
    108       """
    109       if 'subjectAltName' in cert:
    110         return [x[1] for x in cert['subjectAltName'] if x[0].lower() == 'dns']
    111       else:
    112         # Return a list of commonName fields
    113         return [x[0][1] for x in cert['subject']
    114                 if x[0][0].lower() == 'commonname']
    115 
    116     def _validate_certificate_hostname(self, cert, hostname):
    117       """Validates that a given hostname is valid for an SSL certificate.
    118 
    119       Args:
    120         cert: A dictionary representing an SSL certificate.
    121         hostname: The hostname to test.
    122       Returns:
    123         bool: Whether or not the hostname is valid for this certificate.
    124       """
    125       hosts = self._get_valid_hosts_for_cert(cert)
    126       for host in hosts:
    127         # Convert the glob-style hostname expression (eg, '*.google.com') into a
    128         # valid regular expression.
    129         host_re = host.replace('.', '\.').replace('*', '[^.]*')
    130         if re.search('^%s$' % (host_re,), hostname, re.I):
    131           return True
    132       return False
    133 
    134 
    135     def connect(self):
    136       # TODO(frew): When we drop support for <2.6 (in the far distant future),
    137       # change this to socket.create_connection.
    138       self.sock = _create_connection((self.host, self.port))
    139 
    140       if self._tunnel_host:
    141         self._tunnel()
    142 
    143       # ssl and FakeSocket got deprecated. Try for the new hotness of wrap_ssl,
    144       # with fallback.
    145       try:
    146         import ssl
    147         self.sock = ssl.wrap_socket(self.sock,
    148                                     keyfile=self.key_file,
    149                                     certfile=self.cert_file,
    150                                     ca_certs=self.ca_certs,
    151                                     cert_reqs=self.cert_reqs)
    152 
    153         if self.cert_reqs & ssl.CERT_REQUIRED:
    154           cert = self.sock.getpeercert()
    155           hostname = self.host.split(':', 0)[0]
    156           if not self._validate_certificate_hostname(cert, hostname):
    157             raise InvalidCertificateException(hostname, cert,
    158                                               'hostname mismatch')
    159       except ImportError:
    160         ssl = socket.ssl(self.sock,
    161                          keyfile=self.key_file,
    162                          certfile=self.cert_file)
    163         self.sock = httplib.FakeSocket(self.sock, ssl)
    164 
    165   return PresetProxyHTTPSConnection
    166 
    167 
# Here to end of _create_connection copied wholesale from Python 2.6's
# socket.py.
# Unique sentinel used as the default timeout of _create_connection, so that
# "no timeout supplied" can be told apart from any real value, None included.
_GLOBAL_DEFAULT_TIMEOUT = object()
    170 
    171 
    172 def _create_connection(address, timeout=_GLOBAL_DEFAULT_TIMEOUT):
    173   """Connect to *address* and return the socket object.
    174 
    175   Convenience function.  Connect to *address* (a 2-tuple ``(host,
    176   port)``) and return the socket object.  Passing the optional
    177   *timeout* parameter will set the timeout on the socket instance
    178   before attempting to connect.  If no *timeout* is supplied, the
    179   global default timeout setting returned by :func:`getdefaulttimeout`
    180   is used.
    181   """
    182 
    183   msg = "getaddrinfo returns an empty list"
    184   host, port = address
    185   for res in socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM):
    186     af, socktype, proto, canonname, sa = res
    187     sock = None
    188     try:
    189       sock = socket.socket(af, socktype, proto)
    190       if timeout is not _GLOBAL_DEFAULT_TIMEOUT:
    191         sock.settimeout(timeout)
    192       sock.connect(sa)
    193       return sock
    194 
    195     except socket.error, msg:
    196       if sock is not None:
    197         sock.close()
    198 
    199   raise socket.error, msg
    200 
    201 
    202 class FancyRequest(urllib2.Request):
    203   """A request that allows the use of a CONNECT proxy."""
    204 
    205   def __init__(self, *args, **kwargs):
    206     urllib2.Request.__init__(self, *args, **kwargs)
    207     self._tunnel_host = None
    208     self._key_file = None
    209     self._cert_file = None
    210     self._ca_certs = None
    211 
    212   def set_proxy(self, host, type):
    213     saved_type = None
    214 
    215     if self.get_type() == "https" and not self._tunnel_host:
    216       self._tunnel_host = self.get_host()
    217       saved_type = self.get_type()
    218     urllib2.Request.set_proxy(self, host, type)
    219 
    220     if saved_type:
    221       # Don't set self.type, we want to preserve the
    222       # type for tunneling.
    223       self.type = saved_type
    224 
    225   def set_ssl_info(self, key_file=None, cert_file=None, ca_certs=None):
    226     self._key_file = key_file
    227     self._cert_file = cert_file
    228     self._ca_certs = ca_certs
    229 
    230 
    231 class FancyProxyHandler(urllib2.ProxyHandler):
    232   """A ProxyHandler that works with CONNECT-enabled proxies."""
    233 
    234   # Taken verbatim from /usr/lib/python2.5/urllib2.py
    235   def _parse_proxy(self, proxy):
    236     """Return (scheme, user, password, host/port) given a URL or an authority.
    237 
    238     If a URL is supplied, it must have an authority (host:port) component.
    239     According to RFC 3986, having an authority component means the URL must
    240     have two slashes after the scheme:
    241 
    242     >>> _parse_proxy('file:/ftp.example.com/')
    243     Traceback (most recent call last):
    244     ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
    245 
    246     The first three items of the returned tuple may be None.
    247 
    248     Examples of authority parsing:
    249 
    250     >>> _parse_proxy('proxy.example.com')
    251     (None, None, None, 'proxy.example.com')
    252     >>> _parse_proxy('proxy.example.com:3128')
    253     (None, None, None, 'proxy.example.com:3128')
    254 
    255     The authority component may optionally include userinfo (assumed to be
    256     username:password):
    257 
    258     >>> _parse_proxy('joe:password (at] proxy.example.com')
    259     (None, 'joe', 'password', 'proxy.example.com')
    260     >>> _parse_proxy('joe:password (at] proxy.example.com:3128')
    261     (None, 'joe', 'password', 'proxy.example.com:3128')
    262 
    263     Same examples, but with URLs instead:
    264 
    265     >>> _parse_proxy('http://proxy.example.com/')
    266     ('http', None, None, 'proxy.example.com')
    267     >>> _parse_proxy('http://proxy.example.com:3128/')
    268     ('http', None, None, 'proxy.example.com:3128')
    269     >>> _parse_proxy('http://joe:password@proxy.example.com/')
    270     ('http', 'joe', 'password', 'proxy.example.com')
    271     >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    272     ('http', 'joe', 'password', 'proxy.example.com:3128')
    273 
    274     Everything after the authority is ignored:
    275 
    276     >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    277     ('ftp', 'joe', 'password', 'proxy.example.com')
    278 
    279     Test for no trailing '/' case:
    280 
    281     >>> _parse_proxy('http://joe:password@proxy.example.com')
    282     ('http', 'joe', 'password', 'proxy.example.com')
    283 
    284     """
    285     scheme, r_scheme = splittype(proxy)
    286     if not r_scheme.startswith("/"):
    287       # authority
    288       scheme = None
    289       authority = proxy
    290     else:
    291       # URL
    292       if not r_scheme.startswith("//"):
    293         raise ValueError("proxy URL with no authority: %r" % proxy)
    294       # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
    295       # and 3.3.), path is empty or starts with '/'
    296       end = r_scheme.find("/", 2)
    297       if end == -1:
    298         end = None
    299       authority = r_scheme[2:end]
    300     userinfo, hostport = splituser(authority)
    301     if userinfo is not None:
    302       user, password = splitpasswd(userinfo)
    303     else:
    304       user = password = None
    305     return scheme, user, password, hostport
    306 
    307   def proxy_open(self, req, proxy, type):
    308     # This block is copied wholesale from Python2.6 urllib2.
    309     # It is idempotent, so the superclass method call executes as normal
    310     # if invoked.
    311     orig_type = req.get_type()
    312     proxy_type, user, password, hostport = self._parse_proxy(proxy)
    313     if proxy_type is None:
    314       proxy_type = orig_type
    315     if user and password:
    316       user_pass = "%s:%s" % (urllib2.unquote(user), urllib2.unquote(password))
    317       creds = base64.b64encode(user_pass).strip()
    318       # Later calls overwrite earlier calls for the same header
    319       req.add_header("Proxy-authorization", "Basic " + creds)
    320     hostport = urllib2.unquote(hostport)
    321     req.set_proxy(hostport, proxy_type)
    322     # This condition is the change
    323     if orig_type == "https":
    324       return None
    325 
    326     return urllib2.ProxyHandler.proxy_open(self, req, proxy, type)
    327 
    328 
    329 class FancyHTTPSHandler(urllib2.HTTPSHandler):
    330   """An HTTPSHandler that works with CONNECT-enabled proxies."""
    331 
    332   def do_open(self, http_class, req):
    333     # Intentionally very specific so as to opt for false negatives
    334     # rather than false positives.
    335     try:
    336       return urllib2.HTTPSHandler.do_open(
    337           self,
    338           _create_fancy_connection(req._tunnel_host,
    339                                    req._key_file,
    340                                    req._cert_file,
    341                                    req._ca_certs),
    342           req)
    343     except urllib2.URLError, url_error:
    344       try:
    345         import ssl
    346         if (type(url_error.reason) == ssl.SSLError and
    347             url_error.reason.args[0] == 1):
    348           # Display the reason to the user. Need to use args for python2.5
    349           # compat.
    350           raise InvalidCertificateException(req.host, '',
    351                                             url_error.reason.args[1])
    352       except ImportError:
    353         pass
    354 
    355       raise url_error
    356 
    357 
    358 # We have to implement this so that we persist the tunneling behavior
    359 # through redirects.
    360 class FancyRedirectHandler(urllib2.HTTPRedirectHandler):
    361   """A redirect handler that persists CONNECT-enabled proxy information."""
    362 
    363   def redirect_request(self, req, *args, **kwargs):
    364     new_req = urllib2.HTTPRedirectHandler.redirect_request(
    365         self, req, *args, **kwargs)
    366     # Same thing as in our set_proxy implementation, but in this case
    367     # we"ve only got a Request to work with, so it was this or copy
    368     # everything over piecemeal.
    369     #
    370     # Note that we do not persist tunneling behavior from an http request
    371     # to an https request, because an http request does not set _tunnel_host.
    372     #
    373     # Also note that in Python < 2.6, you will get an error in
    374     # FancyHTTPSHandler.do_open() on an https urllib2.Request that uses an http
    375     # proxy, since the proxy type will be set to http instead of https.
    376     # (FancyRequest, and urllib2.Request in Python >= 2.6 set the proxy type to
    377     # https.)  Such an urllib2.Request could result from this redirect
    378     # if you are redirecting from an http request (since an an http request
    379     # does not have _tunnel_host set, and thus you will not set the proxy
    380     # in the code below), and if you have defined a proxy for https in, say,
    381     # FancyProxyHandler, and that proxy has type http.
    382     if hasattr(req, "_tunnel_host") and isinstance(new_req, urllib2.Request):
    383       if new_req.get_type() == "https":
    384         if req._tunnel_host:
    385           # req is proxied, so copy the proxy info.
    386           new_req._tunnel_host = new_req.get_host()
    387           new_req.set_proxy(req.host, "https")
    388         else:
    389           # req is not proxied, so just make sure _tunnel_host is defined.
    390           new_req._tunnel_host = None
    391         new_req.type = "https"
    392     if hasattr(req, "_key_file") and isinstance(new_req, urllib2.Request):
    393       # Copy the auxiliary data in case this or any further redirect is https
    394       new_req._key_file = req._key_file
    395       new_req._cert_file = req._cert_file
    396       new_req._ca_certs = req._ca_certs
    397 
    398     return new_req
    399