1 """An extensible library for opening URLs using a variety of protocols 2 3 The simplest way to use this module is to call the urlopen function, 4 which accepts a string containing a URL or a Request object (described 5 below). It opens the URL and returns the results as file-like 6 object; the returned object has some extra methods described below. 7 8 The OpenerDirector manages a collection of Handler objects that do 9 all the actual work. Each Handler implements a particular protocol or 10 option. The OpenerDirector is a composite object that invokes the 11 Handlers needed to open the requested URL. For example, the 12 HTTPHandler performs HTTP GET and POST requests and deals with 13 non-error returns. The HTTPRedirectHandler automatically deals with 14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler 15 deals with digest authentication. 16 17 urlopen(url, data=None) -- Basic usage is the same as original 18 urllib. pass the url and optionally data to post to an HTTP URL, and 19 get a file-like object back. One difference is that you can also pass 20 a Request instance instead of URL. Raises a URLError (subclass of 21 IOError); for HTTP errors, raises an HTTPError, which can also be 22 treated as a valid response. 23 24 build_opener -- Function that creates a new OpenerDirector instance. 25 Will install the default handlers. Accepts one or more Handlers as 26 arguments, either instances or Handler classes that it will 27 instantiate. If one of the argument is a subclass of the default 28 handler, the argument will be installed instead of the default. 29 30 install_opener -- Installs a new opener as the default opener. 31 32 objects of interest: 33 34 OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages 35 the Handler classes, while dealing with requests and responses. 36 37 Request -- An object that encapsulates the state of a request. The 38 state can be as simple as the URL. 
It can also include extra HTTP 39 headers, e.g. a User-Agent. 40 41 BaseHandler -- 42 43 exceptions: 44 URLError -- A subclass of IOError, individual protocols have their own 45 specific subclass. 46 47 HTTPError -- Also a valid HTTP response, so you can treat an HTTP error 48 as an exceptional event or valid response. 49 50 internals: 51 BaseHandler and parent 52 _call_chain conventions 53 54 Example usage: 55 56 import urllib2 57 58 # set up authentication info 59 authinfo = urllib2.HTTPBasicAuthHandler() 60 authinfo.add_password(realm='PDQ Application', 61 uri='https://mahler:8092/site-updates.py', 62 user='klem', 63 passwd='geheim$parole') 64 65 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"}) 66 67 # build a new opener that adds authentication and caching FTP handlers 68 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler) 69 70 # install it 71 urllib2.install_opener(opener) 72 73 f = urllib2.urlopen('http://www.python.org/') 74 75 76 """ 77 78 # XXX issues: 79 # If an authentication error handler that tries to perform 80 # authentication for some reason but fails, how should the error be 81 # signalled? The client needs to know the HTTP error code. But if 82 # the handler knows that the problem was, e.g., that it didn't know 83 # that hash algo that requested in the challenge, it would be good to 84 # pass that information along to the client, too. 85 # ftp errors aren't handled cleanly 86 # check digest against correct (i.e. 
# non-apache) implementation

# Possible extensions:
# complex proxies  XXX not sure what exactly was meant by this
# abstract factory for opener

import base64
import hashlib
import httplib
import mimetools
import os
import posixpath
import random
import re
import socket
import sys
import time
import urlparse
import bisect
import warnings

# Prefer the fast C implementation of StringIO when it is available.
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from urllib import (unwrap, unquote, splittype, splithost, quote,
     addinfourl, splitport, splittag, toBytes,
     splitattr, ftpwrapper, splituser, splitpasswd, splitvalue)

# support for FileHandler, proxies via environment variables
from urllib import localhost, url2pathname, getproxies, proxy_bypass

# used in the User-Agent header sent to servers, e.g. "Python-urllib/2.7"
__version__ = sys.version[:3]

_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    """Open *url* (a URL string or a Request object) with the default opener.

    The module-wide opener is created lazily on the first call; it can be
    replaced via install_opener().  Returns a file-like response object.
    """
    global _opener
    if _opener is None:
        _opener = build_opener()
    return _opener.open(url, data, timeout)

def install_opener(opener):
    """Install *opener* as the default opener used by urlopen()."""
    global _opener
    _opener = opener

# do these error classes make sense?
# make sure all of the IOError stuff is overridden.  we just want to be
# subtypes.

class URLError(IOError):
    """Error raised when an opener cannot handle a URL.

    URLError is a sub-type of IOError, but it doesn't share any of the
    implementation, so __init__ and __str__ are overridden.  self.args
    is set for compatibility with other EnvironmentError subclasses,
    but it doesn't have the typical layout with errno in slot 0 and
    strerror in slot 1.  This may be better than nothing.
    """

    def __init__(self, reason):
        self.args = reason,
        self.reason = reason

    def __str__(self):
        return '<urlopen error %s>' % self.reason

class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return"""
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        self.code = code
        self.msg = msg
        self.hdrs = hdrs
        self.fp = fp
        self.filename = url
        # The addinfourl classes depend on fp being a valid file
        # object.  In some cases, the HTTPError may not have a valid
        # file object.  If this happens, the simplest workaround is to
        # not initialize the base classes.
        if fp is not None:
            self.__super_init(fp, hdrs, url, code)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)

    # since URLError specifies a .reason attribute, HTTPError should also
    # provide this attribute.  See issue13211 for discussion.
    @property
    def reason(self):
        return self.msg

    def info(self):
        return self.hdrs

# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$")

def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.
    """
    url = request.get_full_url()
    host = urlparse.urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = _cut_port_re.sub("", host, 1)
    return host.lower()
186 187 """ 188 url = request.get_full_url() 189 host = urlparse.urlparse(url)[1] 190 if host == "": 191 host = request.get_header("Host", "") 192 193 # remove port, if present 194 host = _cut_port_re.sub("", host, 1) 195 return host.lower() 196 197 class Request: 198 199 def __init__(self, url, data=None, headers={}, 200 origin_req_host=None, unverifiable=False): 201 # unwrap('<URL:type://host/path>') --> 'type://host/path' 202 self.__original = unwrap(url) 203 self.__original, self.__fragment = splittag(self.__original) 204 self.type = None 205 # self.__r_type is what's left after doing the splittype 206 self.host = None 207 self.port = None 208 self._tunnel_host = None 209 self.data = data 210 self.headers = {} 211 for key, value in headers.items(): 212 self.add_header(key, value) 213 self.unredirected_hdrs = {} 214 if origin_req_host is None: 215 origin_req_host = request_host(self) 216 self.origin_req_host = origin_req_host 217 self.unverifiable = unverifiable 218 219 def __getattr__(self, attr): 220 # XXX this is a fallback mechanism to guard against these 221 # methods getting called in a non-standard order. this may be 222 # too complicated and/or unnecessary. 223 # XXX should the __r_XXX attributes be public? 
224 if attr[:12] == '_Request__r_': 225 name = attr[12:] 226 if hasattr(Request, 'get_' + name): 227 getattr(self, 'get_' + name)() 228 return getattr(self, attr) 229 raise AttributeError, attr 230 231 def get_method(self): 232 if self.has_data(): 233 return "POST" 234 else: 235 return "GET" 236 237 # XXX these helper methods are lame 238 239 def add_data(self, data): 240 self.data = data 241 242 def has_data(self): 243 return self.data is not None 244 245 def get_data(self): 246 return self.data 247 248 def get_full_url(self): 249 if self.__fragment: 250 return '%s#%s' % (self.__original, self.__fragment) 251 else: 252 return self.__original 253 254 def get_type(self): 255 if self.type is None: 256 self.type, self.__r_type = splittype(self.__original) 257 if self.type is None: 258 raise ValueError, "unknown url type: %s" % self.__original 259 return self.type 260 261 def get_host(self): 262 if self.host is None: 263 self.host, self.__r_host = splithost(self.__r_type) 264 if self.host: 265 self.host = unquote(self.host) 266 return self.host 267 268 def get_selector(self): 269 return self.__r_host 270 271 def set_proxy(self, host, type): 272 if self.type == 'https' and not self._tunnel_host: 273 self._tunnel_host = self.host 274 else: 275 self.type = type 276 self.__r_host = self.__original 277 278 self.host = host 279 280 def has_proxy(self): 281 return self.__r_host == self.__original 282 283 def get_origin_req_host(self): 284 return self.origin_req_host 285 286 def is_unverifiable(self): 287 return self.unverifiable 288 289 def add_header(self, key, val): 290 # useful for something like authentication 291 self.headers[key.capitalize()] = val 292 293 def add_unredirected_header(self, key, val): 294 # will not be added to a redirected request 295 self.unredirected_hdrs[key.capitalize()] = val 296 297 def has_header(self, header_name): 298 return (header_name in self.headers or 299 header_name in self.unredirected_hdrs) 300 301 def get_header(self, header_name, 
default=None): 302 return self.headers.get( 303 header_name, 304 self.unredirected_hdrs.get(header_name, default)) 305 306 def header_items(self): 307 hdrs = self.unredirected_hdrs.copy() 308 hdrs.update(self.headers) 309 return hdrs.items() 310 311 class OpenerDirector: 312 def __init__(self): 313 client_version = "Python-urllib/%s" % __version__ 314 self.addheaders = [('User-agent', client_version)] 315 # self.handlers is retained only for backward compatibility 316 self.handlers = [] 317 # manage the individual handlers 318 self.handle_open = {} 319 self.handle_error = {} 320 self.process_response = {} 321 self.process_request = {} 322 323 def add_handler(self, handler): 324 if not hasattr(handler, "add_parent"): 325 raise TypeError("expected BaseHandler instance, got %r" % 326 type(handler)) 327 328 added = False 329 for meth in dir(handler): 330 if meth in ["redirect_request", "do_open", "proxy_open"]: 331 # oops, coincidental match 332 continue 333 334 i = meth.find("_") 335 protocol = meth[:i] 336 condition = meth[i+1:] 337 338 if condition.startswith("error"): 339 j = condition.find("_") + i + 1 340 kind = meth[j+1:] 341 try: 342 kind = int(kind) 343 except ValueError: 344 pass 345 lookup = self.handle_error.get(protocol, {}) 346 self.handle_error[protocol] = lookup 347 elif condition == "open": 348 kind = protocol 349 lookup = self.handle_open 350 elif condition == "response": 351 kind = protocol 352 lookup = self.process_response 353 elif condition == "request": 354 kind = protocol 355 lookup = self.process_request 356 else: 357 continue 358 359 handlers = lookup.setdefault(kind, []) 360 if handlers: 361 bisect.insort(handlers, handler) 362 else: 363 handlers.append(handler) 364 added = True 365 366 if added: 367 bisect.insort(self.handlers, handler) 368 handler.add_parent(self) 369 370 def close(self): 371 # Only exists for backwards compatibility. 
class OpenerDirector:
    """Manage a chain of Handler objects and use them to open URLs.

    Sends the Python-urllib User-Agent by default and dispatches each
    request to the registered open/request/response/error handlers.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # self.handlers is retained only for backward compatibility
        self.handlers = []
        # per-category handler registries, keyed by protocol (or, for
        # errors, by protocol then numeric code)
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register every <protocol>_<condition> method *handler* defines."""
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        registered = False
        for meth in dir(handler):
            if meth in ("redirect_request", "do_open", "proxy_open"):
                # oops, coincidental match
                continue

            sep = meth.find("_")
            protocol = meth[:sep]
            condition = meth[sep+1:]

            if condition.startswith("error"):
                # e.g. "http_error_404" registers under kind 404
                j = condition.find("_") + sep + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            bucket = lookup.setdefault(kind, [])
            if bucket:
                # keep handlers sorted by handler_order
                bisect.insort(bucket, handler)
            else:
                bucket.append(handler)
            registered = True

        if registered:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        for handler in chain.get(kind, ()):
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl* (URL string or Request) and return the response."""
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        req.timeout = timeout
        protocol = req.get_type()

        # pre-process request
        meth_name = protocol + "_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol + "_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # default handlers get first crack ...
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        # ... then the protocol-specific handlers ...
        protocol = req.get_type()
        result = self._call_chain(self.handle_open, protocol,
                                  protocol + '_open', req)
        if result:
            return result

        # ... and finally the catch-all handlers.
        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered error handlers."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            err_table = self.handle_error['http']  # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            err_table = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (err_table, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        # nothing handled the specific HTTP code; fall back to the
        # default HTTP error handlers
        if http_err:
            args = (err_table, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)

# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable, HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    import types
    def isclass(obj):
        return isinstance(obj, (types.ClassType, type))

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)

    # Drop every default whose job is taken over by a user-supplied
    # handler (an instance of, or a subclass of, that default).
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if isclass(check):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isclass(h):
            h = h()
        opener.add_handler(h)
    return opener

class BaseHandler:
    """Parent class for URL handlers managed by an OpenerDirector."""
    # handlers are tried in increasing handler_order
    handler_order = 500

    def add_parent(self, parent):
        # the OpenerDirector that owns this handler
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if not hasattr(other, "handler_order"):
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other.handler_order
"handler_order"): 505 # Try to preserve the old behavior of having custom classes 506 # inserted after default ones (works only for custom user 507 # classes which are not aware of handler_order). 508 return True 509 return self.handler_order < other.handler_order 510 511 512 class HTTPErrorProcessor(BaseHandler): 513 """Process HTTP error responses.""" 514 handler_order = 1000 # after all other processing 515 516 def http_response(self, request, response): 517 code, msg, hdrs = response.code, response.msg, response.info() 518 519 # According to RFC 2616, "2xx" code indicates that the client's 520 # request was successfully received, understood, and accepted. 521 if not (200 <= code < 300): 522 response = self.parent.error( 523 'http', request, response, code, msg, hdrs) 524 525 return response 526 527 https_response = http_response 528 529 class HTTPDefaultErrorHandler(BaseHandler): 530 def http_error_default(self, req, fp, code, msg, hdrs): 531 raise HTTPError(req.get_full_url(), code, msg, hdrs, fp) 532 533 class HTTPRedirectHandler(BaseHandler): 534 # maximum number of redirections to any single URL 535 # this is needed because of the state that cookies introduce 536 max_repeats = 4 537 # maximum total number of redirections (regardless of URL) before 538 # assuming we're in a loop 539 max_redirections = 10 540 541 def redirect_request(self, req, fp, code, msg, headers, newurl): 542 """Return a Request or None in response to a redirect. 543 544 This is called by the http_error_30x methods when a 545 redirection response is received. If a redirection should 546 take place, return a new Request to allow http_error_30x to 547 perform the redirect. Otherwise, raise HTTPError if no-one 548 else should try to handle this url. Return None if you can't 549 but another Handler might. 
550 """ 551 m = req.get_method() 552 if (code in (301, 302, 303, 307) and m in ("GET", "HEAD") 553 or code in (301, 302, 303) and m == "POST"): 554 # Strictly (according to RFC 2616), 301 or 302 in response 555 # to a POST MUST NOT cause a redirection without confirmation 556 # from the user (of urllib2, in this case). In practice, 557 # essentially all clients do redirect in this case, so we 558 # do the same. 559 # be conciliant with URIs containing a space 560 newurl = newurl.replace(' ', '%20') 561 newheaders = dict((k,v) for k,v in req.headers.items() 562 if k.lower() not in ("content-length", "content-type") 563 ) 564 return Request(newurl, 565 headers=newheaders, 566 origin_req_host=req.get_origin_req_host(), 567 unverifiable=True) 568 else: 569 raise HTTPError(req.get_full_url(), code, msg, headers, fp) 570 571 # Implementation note: To avoid the server sending us into an 572 # infinite loop, the request object needs to track what URLs we 573 # have already seen. Do this by adding a handler-specific 574 # attribute to the Request object. 575 def http_error_302(self, req, fp, code, msg, headers): 576 # Some servers (incorrectly) return multiple Location headers 577 # (so probably same goes for URI). Use first header. 578 if 'location' in headers: 579 newurl = headers.getheaders('location')[0] 580 elif 'uri' in headers: 581 newurl = headers.getheaders('uri')[0] 582 else: 583 return 584 585 # fix a possible malformed URL 586 urlparts = urlparse.urlparse(newurl) 587 if not urlparts.path: 588 urlparts = list(urlparts) 589 urlparts[2] = "/" 590 newurl = urlparse.urlunparse(urlparts) 591 592 newurl = urlparse.urljoin(req.get_full_url(), newurl) 593 594 # For security reasons we do not allow redirects to protocols 595 # other than HTTP, HTTPS or FTP. 
596 newurl_lower = newurl.lower() 597 if not (newurl_lower.startswith('http://') or 598 newurl_lower.startswith('https://') or 599 newurl_lower.startswith('ftp://')): 600 raise HTTPError(newurl, code, 601 msg + " - Redirection to url '%s' is not allowed" % 602 newurl, 603 headers, fp) 604 605 # XXX Probably want to forget about the state of the current 606 # request, although that might interact poorly with other 607 # handlers that also use handler-specific request attributes 608 new = self.redirect_request(req, fp, code, msg, headers, newurl) 609 if new is None: 610 return 611 612 # loop detection 613 # .redirect_dict has a key url if url was previously visited. 614 if hasattr(req, 'redirect_dict'): 615 visited = new.redirect_dict = req.redirect_dict 616 if (visited.get(newurl, 0) >= self.max_repeats or 617 len(visited) >= self.max_redirections): 618 raise HTTPError(req.get_full_url(), code, 619 self.inf_msg + msg, headers, fp) 620 else: 621 visited = new.redirect_dict = req.redirect_dict = {} 622 visited[newurl] = visited.get(newurl, 0) + 1 623 624 # Don't close the fp until we are sure that we won't use it 625 # with HTTPError. 626 fp.read() 627 fp.close() 628 629 return self.parent.open(new, timeout=req.timeout) 630 631 http_error_301 = http_error_303 = http_error_307 = http_error_302 632 633 inf_msg = "The HTTP server returned a redirect error that would " \ 634 "lead to an infinite loop.\n" \ 635 "The last 30x error message was:\n" 636 637 638 def _parse_proxy(proxy): 639 """Return (scheme, user, password, host/port) given a URL or an authority. 640 641 If a URL is supplied, it must have an authority (host:port) component. 642 According to RFC 3986, having an authority component means the URL must 643 have two slashes after the scheme: 644 645 >>> _parse_proxy('file:/ftp.example.com/') 646 Traceback (most recent call last): 647 ValueError: proxy URL with no authority: 'file:/ftp.example.com/' 648 649 The first three items of the returned tuple may be None. 
def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, r_scheme = splittype(proxy)
    if not r_scheme.startswith("/"):
        # authority
        scheme = None
        authority = proxy
    else:
        # URL
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.2.
        # and 3.3.), path is empty or starts with '/'
        end = r_scheme.find("/", 2)
        if end == -1:
            end = None
        authority = r_scheme[2:end]
    userinfo, hostport = splituser(authority)
    if userinfo is not None:
        user, password = splitpasswd(userinfo)
    else:
        user = password = None
    return scheme, user, password, hostport

class ProxyHandler(BaseHandler):
    """Rewrite outgoing requests so they travel through a proxy."""

    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        # Synthesize a <scheme>_open method for every configured scheme,
        # each one bound to its proxy URL via default arguments.
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        orig_type = req.get_type()
        proxy_type, user, password, hostport = _parse_proxy(proxy)

        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            creds = base64.b64encode(user_pass).strip()
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)

        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)

class HTTPPasswordMgr:
    """Map (realm, URI prefix) pairs to (user, password) credentials."""

    def __init__(self):
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, basestring):
            uri = [uri]
        if not realm in self.passwd:
            self.passwd[realm] = {}
        # index the credentials under both the default-port and the
        # explicit-port reductions of each URI
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.iteritems():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlparse.urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False
806 """ 807 if base == test: 808 return True 809 if base[0] != test[0]: 810 return False 811 common = posixpath.commonprefix((base[1], test[1])) 812 if len(common) == len(base[1]): 813 return True 814 return False 815 816 817 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): 818 819 def find_user_password(self, realm, authuri): 820 user, password = HTTPPasswordMgr.find_user_password(self, realm, 821 authuri) 822 if user is not None: 823 return user, password 824 return HTTPPasswordMgr.find_user_password(self, None, authuri) 825 826 827 class AbstractBasicAuthHandler: 828 829 # XXX this allows for multiple auth-schemes, but will stupidly pick 830 # the last one with a realm specified. 831 832 # allow for double- and single-quoted realm values 833 # (single quotes are a violation of the RFC, but appear in the wild) 834 rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+' 835 'realm=(["\']?)([^"\']*)\\2', re.I) 836 837 # XXX could pre-emptively send auth info already accepted (RFC 2617, 838 # end of section 2, and section 1.2 immediately after "credentials" 839 # production). 840 841 def __init__(self, password_mgr=None): 842 if password_mgr is None: 843 password_mgr = HTTPPasswordMgr() 844 self.passwd = password_mgr 845 self.add_password = self.passwd.add_password 846 self.retried = 0 847 848 def reset_retry_count(self): 849 self.retried = 0 850 851 def http_error_auth_reqed(self, authreq, host, req, headers): 852 # host may be an authority (without userinfo) or a URL with an 853 # authority 854 # XXX could be multiple headers 855 authreq = headers.get(authreq, None) 856 857 if self.retried > 5: 858 # retry sending the username:password 5 times before failing. 
859 raise HTTPError(req.get_full_url(), 401, "basic auth failed", 860 headers, None) 861 else: 862 self.retried += 1 863 864 if authreq: 865 mo = AbstractBasicAuthHandler.rx.search(authreq) 866 if mo: 867 scheme, quote, realm = mo.groups() 868 if quote not in ['"', "'"]: 869 warnings.warn("Basic Auth Realm was unquoted", 870 UserWarning, 2) 871 if scheme.lower() == 'basic': 872 response = self.retry_http_basic_auth(host, req, realm) 873 if response and response.code != 401: 874 self.retried = 0 875 return response 876 877 def retry_http_basic_auth(self, host, req, realm): 878 user, pw = self.passwd.find_user_password(realm, host) 879 if pw is not None: 880 raw = "%s:%s" % (user, pw) 881 auth = 'Basic %s' % base64.b64encode(raw).strip() 882 if req.headers.get(self.auth_header, None) == auth: 883 return None 884 req.add_unredirected_header(self.auth_header, auth) 885 return self.parent.open(req, timeout=req.timeout) 886 else: 887 return None 888 889 890 class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): 891 892 auth_header = 'Authorization' 893 894 def http_error_401(self, req, fp, code, msg, headers): 895 url = req.get_full_url() 896 response = self.http_error_auth_reqed('www-authenticate', 897 url, req, headers) 898 self.reset_retry_count() 899 return response 900 901 902 class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): 903 904 auth_header = 'Proxy-authorization' 905 906 def http_error_407(self, req, fp, code, msg, headers): 907 # http_error_auth_reqed requires that there is no userinfo component in 908 # authority. Assume there isn't one, since urllib2 does not (and 909 # should not, RFC 3986 s. 3.2.1) support requests for URLs containing 910 # userinfo. 911 authority = req.get_host() 912 response = self.http_error_auth_reqed('proxy-authenticate', 913 authority, req, headers) 914 self.reset_retry_count() 915 return response 916 917 918 def randombytes(n): 919 """Return n random bytes.""" 920 # Use /dev/urandom if it is available. 
Fall back to random module 921 # if not. It might be worthwhile to extend this function to use 922 # other platform-specific mechanisms for getting random bytes. 923 if os.path.exists("/dev/urandom"): 924 f = open("/dev/urandom") 925 s = f.read(n) 926 f.close() 927 return s 928 else: 929 L = [chr(random.randrange(0, 256)) for i in range(n)] 930 return "".join(L) 931 932 class AbstractDigestAuthHandler: 933 # Digest authentication is specified in RFC 2617. 934 935 # XXX The client does not inspect the Authentication-Info header 936 # in a successful response. 937 938 # XXX It should be possible to test this implementation against 939 # a mock server that just generates a static set of challenges. 940 941 # XXX qop="auth-int" supports is shaky 942 943 def __init__(self, passwd=None): 944 if passwd is None: 945 passwd = HTTPPasswordMgr() 946 self.passwd = passwd 947 self.add_password = self.passwd.add_password 948 self.retried = 0 949 self.nonce_count = 0 950 self.last_nonce = None 951 952 def reset_retry_count(self): 953 self.retried = 0 954 955 def http_error_auth_reqed(self, auth_header, host, req, headers): 956 authreq = headers.get(auth_header, None) 957 if self.retried > 5: 958 # Don't fail endlessly - if we failed once, we'll probably 959 # fail a second time. Hm. Unless the Password Manager is 960 # prompting for the information. Crap. 
This isn't great 961 # but it's better than the current 'repeat until recursion 962 # depth exceeded' approach <wink> 963 raise HTTPError(req.get_full_url(), 401, "digest auth failed", 964 headers, None) 965 else: 966 self.retried += 1 967 if authreq: 968 scheme = authreq.split()[0] 969 if scheme.lower() == 'digest': 970 return self.retry_http_digest_auth(req, authreq) 971 972 def retry_http_digest_auth(self, req, auth): 973 token, challenge = auth.split(' ', 1) 974 chal = parse_keqv_list(parse_http_list(challenge)) 975 auth = self.get_authorization(req, chal) 976 if auth: 977 auth_val = 'Digest %s' % auth 978 if req.headers.get(self.auth_header, None) == auth_val: 979 return None 980 req.add_unredirected_header(self.auth_header, auth_val) 981 resp = self.parent.open(req, timeout=req.timeout) 982 return resp 983 984 def get_cnonce(self, nonce): 985 # The cnonce-value is an opaque 986 # quoted string value provided by the client and used by both client 987 # and server to avoid chosen plaintext attacks, to provide mutual 988 # authentication, and to provide some message integrity protection. 989 # This isn't a fabulous effort, but it's probably Good Enough. 
990 dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(), 991 randombytes(8))).hexdigest() 992 return dig[:16] 993 994 def get_authorization(self, req, chal): 995 try: 996 realm = chal['realm'] 997 nonce = chal['nonce'] 998 qop = chal.get('qop') 999 algorithm = chal.get('algorithm', 'MD5') 1000 # mod_digest doesn't send an opaque, even though it isn't 1001 # supposed to be optional 1002 opaque = chal.get('opaque', None) 1003 except KeyError: 1004 return None 1005 1006 H, KD = self.get_algorithm_impls(algorithm) 1007 if H is None: 1008 return None 1009 1010 user, pw = self.passwd.find_user_password(realm, req.get_full_url()) 1011 if user is None: 1012 return None 1013 1014 # XXX not implemented yet 1015 if req.has_data(): 1016 entdig = self.get_entity_digest(req.get_data(), chal) 1017 else: 1018 entdig = None 1019 1020 A1 = "%s:%s:%s" % (user, realm, pw) 1021 A2 = "%s:%s" % (req.get_method(), 1022 # XXX selector: what about proxies and full urls 1023 req.get_selector()) 1024 if qop == 'auth': 1025 if nonce == self.last_nonce: 1026 self.nonce_count += 1 1027 else: 1028 self.nonce_count = 1 1029 self.last_nonce = nonce 1030 1031 ncvalue = '%08x' % self.nonce_count 1032 cnonce = self.get_cnonce(nonce) 1033 noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2)) 1034 respdig = KD(H(A1), noncebit) 1035 elif qop is None: 1036 respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) 1037 else: 1038 # XXX handle auth-int. 1039 raise URLError("qop '%s' is not supported." % qop) 1040 1041 # XXX should the partial digests be encoded too? 
1042 1043 base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ 1044 'response="%s"' % (user, realm, nonce, req.get_selector(), 1045 respdig) 1046 if opaque: 1047 base += ', opaque="%s"' % opaque 1048 if entdig: 1049 base += ', digest="%s"' % entdig 1050 base += ', algorithm="%s"' % algorithm 1051 if qop: 1052 base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) 1053 return base 1054 1055 def get_algorithm_impls(self, algorithm): 1056 # algorithm should be case-insensitive according to RFC2617 1057 algorithm = algorithm.upper() 1058 # lambdas assume digest modules are imported at the top level 1059 if algorithm == 'MD5': 1060 H = lambda x: hashlib.md5(x).hexdigest() 1061 elif algorithm == 'SHA': 1062 H = lambda x: hashlib.sha1(x).hexdigest() 1063 # XXX MD5-sess 1064 KD = lambda s, d: H("%s:%s" % (s, d)) 1065 return H, KD 1066 1067 def get_entity_digest(self, data, chal): 1068 # XXX not implemented yet 1069 return None 1070 1071 1072 class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): 1073 """An authentication protocol defined by RFC 2069 1074 1075 Digest authentication improves on basic authentication because it 1076 does not transmit passwords in the clear. 
1077 """ 1078 1079 auth_header = 'Authorization' 1080 handler_order = 490 # before Basic auth 1081 1082 def http_error_401(self, req, fp, code, msg, headers): 1083 host = urlparse.urlparse(req.get_full_url())[1] 1084 retry = self.http_error_auth_reqed('www-authenticate', 1085 host, req, headers) 1086 self.reset_retry_count() 1087 return retry 1088 1089 1090 class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): 1091 1092 auth_header = 'Proxy-Authorization' 1093 handler_order = 490 # before Basic auth 1094 1095 def http_error_407(self, req, fp, code, msg, headers): 1096 host = req.get_host() 1097 retry = self.http_error_auth_reqed('proxy-authenticate', 1098 host, req, headers) 1099 self.reset_retry_count() 1100 return retry 1101 1102 class AbstractHTTPHandler(BaseHandler): 1103 1104 def __init__(self, debuglevel=0): 1105 self._debuglevel = debuglevel 1106 1107 def set_http_debuglevel(self, level): 1108 self._debuglevel = level 1109 1110 def do_request_(self, request): 1111 host = request.get_host() 1112 if not host: 1113 raise URLError('no host given') 1114 1115 if request.has_data(): # POST 1116 data = request.get_data() 1117 if not request.has_header('Content-type'): 1118 request.add_unredirected_header( 1119 'Content-type', 1120 'application/x-www-form-urlencoded') 1121 if not request.has_header('Content-length'): 1122 request.add_unredirected_header( 1123 'Content-length', '%d' % len(data)) 1124 1125 sel_host = host 1126 if request.has_proxy(): 1127 scheme, sel = splittype(request.get_selector()) 1128 sel_host, sel_path = splithost(sel) 1129 1130 if not request.has_header('Host'): 1131 request.add_unredirected_header('Host', sel_host) 1132 for name, value in self.parent.addheaders: 1133 name = name.capitalize() 1134 if not request.has_header(name): 1135 request.add_unredirected_header(name, value) 1136 1137 return request 1138 1139 def do_open(self, http_class, req): 1140 """Return an addinfourl object for the request, using http_class. 
1141 1142 http_class must implement the HTTPConnection API from httplib. 1143 The addinfourl return value is a file-like object. It also 1144 has methods and attributes including: 1145 - info(): return a mimetools.Message object for the headers 1146 - geturl(): return the original request URL 1147 - code: HTTP status code 1148 """ 1149 host = req.get_host() 1150 if not host: 1151 raise URLError('no host given') 1152 1153 h = http_class(host, timeout=req.timeout) # will parse host:port 1154 h.set_debuglevel(self._debuglevel) 1155 1156 headers = dict(req.unredirected_hdrs) 1157 headers.update(dict((k, v) for k, v in req.headers.items() 1158 if k not in headers)) 1159 1160 # We want to make an HTTP/1.1 request, but the addinfourl 1161 # class isn't prepared to deal with a persistent connection. 1162 # It will try to read all remaining data from the socket, 1163 # which will block while the server waits for the next request. 1164 # So make sure the connection gets closed after the (only) 1165 # request. 1166 headers["Connection"] = "close" 1167 headers = dict( 1168 (name.title(), val) for name, val in headers.items()) 1169 1170 if req._tunnel_host: 1171 tunnel_headers = {} 1172 proxy_auth_hdr = "Proxy-Authorization" 1173 if proxy_auth_hdr in headers: 1174 tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr] 1175 # Proxy-Authorization should not be sent to origin 1176 # server. 1177 del headers[proxy_auth_hdr] 1178 h.set_tunnel(req._tunnel_host, headers=tunnel_headers) 1179 1180 try: 1181 h.request(req.get_method(), req.get_selector(), req.data, headers) 1182 except socket.error, err: # XXX what error? 1183 h.close() 1184 raise URLError(err) 1185 else: 1186 try: 1187 r = h.getresponse(buffering=True) 1188 except TypeError: # buffering kw not supported 1189 r = h.getresponse() 1190 1191 # Pick apart the HTTPResponse object to get the addinfourl 1192 # object initialized properly. 
1193 1194 # Wrap the HTTPResponse object in socket's file object adapter 1195 # for Windows. That adapter calls recv(), so delegate recv() 1196 # to read(). This weird wrapping allows the returned object to 1197 # have readline() and readlines() methods. 1198 1199 # XXX It might be better to extract the read buffering code 1200 # out of socket._fileobject() and into a base class. 1201 1202 r.recv = r.read 1203 fp = socket._fileobject(r, close=True) 1204 1205 resp = addinfourl(fp, r.msg, req.get_full_url()) 1206 resp.code = r.status 1207 resp.msg = r.reason 1208 return resp 1209 1210 1211 class HTTPHandler(AbstractHTTPHandler): 1212 1213 def http_open(self, req): 1214 return self.do_open(httplib.HTTPConnection, req) 1215 1216 http_request = AbstractHTTPHandler.do_request_ 1217 1218 if hasattr(httplib, 'HTTPS'): 1219 class HTTPSHandler(AbstractHTTPHandler): 1220 1221 def https_open(self, req): 1222 return self.do_open(httplib.HTTPSConnection, req) 1223 1224 https_request = AbstractHTTPHandler.do_request_ 1225 1226 class HTTPCookieProcessor(BaseHandler): 1227 def __init__(self, cookiejar=None): 1228 import cookielib 1229 if cookiejar is None: 1230 cookiejar = cookielib.CookieJar() 1231 self.cookiejar = cookiejar 1232 1233 def http_request(self, request): 1234 self.cookiejar.add_cookie_header(request) 1235 return request 1236 1237 def http_response(self, request, response): 1238 self.cookiejar.extract_cookies(response, request) 1239 return response 1240 1241 https_request = http_request 1242 https_response = http_response 1243 1244 class UnknownHandler(BaseHandler): 1245 def unknown_open(self, req): 1246 type = req.get_type() 1247 raise URLError('unknown url type: %s' % type) 1248 1249 def parse_keqv_list(l): 1250 """Parse list of key=value strings where keys are not duplicated.""" 1251 parsed = {} 1252 for elt in l: 1253 k, v = elt.split('=', 1) 1254 if v[0] == '"' and v[-1] == '"': 1255 v = v[1:-1] 1256 parsed[k] = v 1257 return parsed 1258 1259 def 
parse_http_list(s): 1260 """Parse lists as described by RFC 2068 Section 2. 1261 1262 In particular, parse comma-separated lists where the elements of 1263 the list may include quoted-strings. A quoted-string could 1264 contain a comma. A non-quoted string could have quotes in the 1265 middle. Neither commas nor quotes count if they are escaped. 1266 Only double-quotes count, not single-quotes. 1267 """ 1268 res = [] 1269 part = '' 1270 1271 escape = quote = False 1272 for cur in s: 1273 if escape: 1274 part += cur 1275 escape = False 1276 continue 1277 if quote: 1278 if cur == '\\': 1279 escape = True 1280 continue 1281 elif cur == '"': 1282 quote = False 1283 part += cur 1284 continue 1285 1286 if cur == ',': 1287 res.append(part) 1288 part = '' 1289 continue 1290 1291 if cur == '"': 1292 quote = True 1293 1294 part += cur 1295 1296 # append last part 1297 if part: 1298 res.append(part) 1299 1300 return [part.strip() for part in res] 1301 1302 def _safe_gethostbyname(host): 1303 try: 1304 return socket.gethostbyname(host) 1305 except socket.gaierror: 1306 return None 1307 1308 class FileHandler(BaseHandler): 1309 # Use local file or FTP depending on form of URL 1310 def file_open(self, req): 1311 url = req.get_selector() 1312 if url[:2] == '//' and url[2:3] != '/' and (req.host and 1313 req.host != 'localhost'): 1314 req.type = 'ftp' 1315 return self.parent.open(req) 1316 else: 1317 return self.open_local_file(req) 1318 1319 # names for the localhost 1320 names = None 1321 def get_names(self): 1322 if FileHandler.names is None: 1323 try: 1324 FileHandler.names = tuple( 1325 socket.gethostbyname_ex('localhost')[2] + 1326 socket.gethostbyname_ex(socket.gethostname())[2]) 1327 except socket.gaierror: 1328 FileHandler.names = (socket.gethostbyname('localhost'),) 1329 return FileHandler.names 1330 1331 # not entirely sure what the rules are here 1332 def open_local_file(self, req): 1333 import email.utils 1334 import mimetypes 1335 host = req.get_host() 1336 filename 
= req.get_selector() 1337 localfile = url2pathname(filename) 1338 try: 1339 stats = os.stat(localfile) 1340 size = stats.st_size 1341 modified = email.utils.formatdate(stats.st_mtime, usegmt=True) 1342 mtype = mimetypes.guess_type(filename)[0] 1343 headers = mimetools.Message(StringIO( 1344 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % 1345 (mtype or 'text/plain', size, modified))) 1346 if host: 1347 host, port = splitport(host) 1348 if not host or \ 1349 (not port and _safe_gethostbyname(host) in self.get_names()): 1350 if host: 1351 origurl = 'file://' + host + filename 1352 else: 1353 origurl = 'file://' + filename 1354 return addinfourl(open(localfile, 'rb'), headers, origurl) 1355 except OSError, msg: 1356 # urllib2 users shouldn't expect OSErrors coming from urlopen() 1357 raise URLError(msg) 1358 raise URLError('file not on local host') 1359 1360 class FTPHandler(BaseHandler): 1361 def ftp_open(self, req): 1362 import ftplib 1363 import mimetypes 1364 host = req.get_host() 1365 if not host: 1366 raise URLError('ftp error: no host given') 1367 host, port = splitport(host) 1368 if port is None: 1369 port = ftplib.FTP_PORT 1370 else: 1371 port = int(port) 1372 1373 # username/password handling 1374 user, host = splituser(host) 1375 if user: 1376 user, passwd = splitpasswd(user) 1377 else: 1378 passwd = None 1379 host = unquote(host) 1380 user = user or '' 1381 passwd = passwd or '' 1382 1383 try: 1384 host = socket.gethostbyname(host) 1385 except socket.error, msg: 1386 raise URLError(msg) 1387 path, attrs = splitattr(req.get_selector()) 1388 dirs = path.split('/') 1389 dirs = map(unquote, dirs) 1390 dirs, file = dirs[:-1], dirs[-1] 1391 if dirs and not dirs[0]: 1392 dirs = dirs[1:] 1393 try: 1394 fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout) 1395 type = file and 'I' or 'D' 1396 for attr in attrs: 1397 attr, value = splitvalue(attr) 1398 if attr.lower() == 'type' and \ 1399 value in ('a', 'A', 'i', 'I', 'd', 'D'): 1400 type 
= value.upper() 1401 fp, retrlen = fw.retrfile(file, type) 1402 headers = "" 1403 mtype = mimetypes.guess_type(req.get_full_url())[0] 1404 if mtype: 1405 headers += "Content-type: %s\n" % mtype 1406 if retrlen is not None and retrlen >= 0: 1407 headers += "Content-length: %d\n" % retrlen 1408 sf = StringIO(headers) 1409 headers = mimetools.Message(sf) 1410 return addinfourl(fp, headers, req.get_full_url()) 1411 except ftplib.all_errors, msg: 1412 raise URLError, ('ftp error: %s' % msg), sys.exc_info()[2] 1413 1414 def connect_ftp(self, user, passwd, host, port, dirs, timeout): 1415 fw = ftpwrapper(user, passwd, host, port, dirs, timeout, 1416 persistent=False) 1417 ## fw.ftp.set_debuglevel(1) 1418 return fw 1419 1420 class CacheFTPHandler(FTPHandler): 1421 # XXX would be nice to have pluggable cache strategies 1422 # XXX this stuff is definitely not thread safe 1423 def __init__(self): 1424 self.cache = {} 1425 self.timeout = {} 1426 self.soonest = 0 1427 self.delay = 60 1428 self.max_conns = 16 1429 1430 def setTimeout(self, t): 1431 self.delay = t 1432 1433 def setMaxConns(self, m): 1434 self.max_conns = m 1435 1436 def connect_ftp(self, user, passwd, host, port, dirs, timeout): 1437 key = user, host, port, '/'.join(dirs), timeout 1438 if key in self.cache: 1439 self.timeout[key] = time.time() + self.delay 1440 else: 1441 self.cache[key] = ftpwrapper(user, passwd, host, port, dirs, timeout) 1442 self.timeout[key] = time.time() + self.delay 1443 self.check_cache() 1444 return self.cache[key] 1445 1446 def check_cache(self): 1447 # first check for old ones 1448 t = time.time() 1449 if self.soonest <= t: 1450 for k, v in self.timeout.items(): 1451 if v < t: 1452 self.cache[k].close() 1453 del self.cache[k] 1454 del self.timeout[k] 1455 self.soonest = min(self.timeout.values()) 1456 1457 # then check the size 1458 if len(self.cache) == self.max_conns: 1459 for k, v in self.timeout.items(): 1460 if v == self.soonest: 1461 del self.cache[k] 1462 del self.timeout[k] 
1463 break 1464 self.soonest = min(self.timeout.values()) 1465 1466 def clear_cache(self): 1467 for conn in self.cache.values(): 1468 conn.close() 1469 self.cache.clear() 1470 self.timeout.clear() 1471