1 """An extensible library for opening URLs using a variety of protocols 2 3 The simplest way to use this module is to call the urlopen function, 4 which accepts a string containing a URL or a Request object (described 5 below). It opens the URL and returns the results as file-like 6 object; the returned object has some extra methods described below. 7 8 The OpenerDirector manages a collection of Handler objects that do 9 all the actual work. Each Handler implements a particular protocol or 10 option. The OpenerDirector is a composite object that invokes the 11 Handlers needed to open the requested URL. For example, the 12 HTTPHandler performs HTTP GET and POST requests and deals with 13 non-error returns. The HTTPRedirectHandler automatically deals with 14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler 15 deals with digest authentication. 16 17 urlopen(url, data=None) -- Basic usage is the same as original 18 urllib. pass the url and optionally data to post to an HTTP URL, and 19 get a file-like object back. One difference is that you can also pass 20 a Request instance instead of URL. Raises a URLError (subclass of 21 IOError); for HTTP errors, raises an HTTPError, which can also be 22 treated as a valid response. 23 24 build_opener -- Function that creates a new OpenerDirector instance. 25 Will install the default handlers. Accepts one or more Handlers as 26 arguments, either instances or Handler classes that it will 27 instantiate. If one of the argument is a subclass of the default 28 handler, the argument will be installed instead of the default. 29 30 install_opener -- Installs a new opener as the default opener. 31 32 objects of interest: 33 34 OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages 35 the Handler classes, while dealing with requests and responses. 36 37 Request -- An object that encapsulates the state of a request. The 38 state can be as simple as the URL. It can also include extra HTTP 39 headers, e.g. a User-Agent. 40 41 BaseHandler -- 42 43 exceptions: 44 URLError -- A subclass of IOError, individual protocols have their own 45 specific subclass. 46 47 HTTPError -- Also a valid HTTP response, so you can treat an HTTP error 48 as an exceptional event or valid response. 49 50 internals: 51 BaseHandler and parent 52 _call_chain conventions 53 54 Example usage: 55 56 import urllib2 57 58 # set up authentication info 59 authinfo = urllib2.HTTPBasicAuthHandler() 60 authinfo.add_password(realm='PDQ Application', 61 uri='https://mahler:8092/site-updates.py', 62 user='klem', 63 passwd='geheim$parole') 64 65 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"}) 66 67 # build a new opener that adds authentication and caching FTP handlers 68 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler) 69 70 # install it 71 urllib2.install_opener(opener) 72 73 f = urllib2.urlopen('http://www.python.org/') 74 75 76 """ 77 78 # XXX issues: 79 # If an authentication error handler that tries to perform 80 # authentication for some reason but fails, how should the error be 81 # signalled? The client needs to know the HTTP error code. But if 82 # the handler knows that the problem was, e.g., that it didn't know 83 # that hash algo that requested in the challenge, it would be good to 84 # pass that information along to the client, too. 85 # ftp errors aren't handled cleanly 86 # check digest against correct (i.e. non-apache) implementation 87 88 # Possible extensions: 89 # complex proxies XXX not sure what exactly was meant by this 90 # abstract factory for opener 91 92 import base64 93 import hashlib 94 import httplib 95 import mimetools 96 import os 97 import posixpath 98 import random 99 import re 100 import socket 101 import sys 102 import time 103 import urlparse 104 import bisect 105 106 try: 107 from cStringIO import StringIO 108 except ImportError: 109 from StringIO import StringIO 110 111 from urllib import (unwrap, unquote, splittype, splithost, quote, 112 addinfourl, splitport, splittag, 113 splitattr, ftpwrapper, splituser, splitpasswd, splitvalue) 114 115 # support for FileHandler, proxies via environment variables 116 from urllib import localhost, url2pathname, getproxies, proxy_bypass 117 118 # used in User-Agent header sent 119 __version__ = sys.version[:3] 120 121 _opener = None 122 def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): 123 global _opener 124 if _opener is None: 125 _opener = build_opener() 126 return _opener.open(url, data, timeout) 127 128 def install_opener(opener): 129 global _opener 130 _opener = opener 131 132 # do these error classes make sense? 133 # make sure all of the IOError stuff is overridden. we just want to be 134 # subtypes. 135 136 class URLError(IOError): 137 # URLError is a sub-type of IOError, but it doesn't share any of 138 # the implementation. need to override __init__ and __str__. 139 # It sets self.args for compatibility with other EnvironmentError 140 # subclasses, but args doesn't have the typical format with errno in 141 # slot 0 and strerror in slot 1. This may be better than nothing. 142 def __init__(self, reason): 143 self.args = reason, 144 self.reason = reason 145 146 def __str__(self): 147 return '<urlopen error %s>' % self.reason 148 149 class HTTPError(URLError, addinfourl): 150 """Raised when HTTP error occurs, but also acts like non-error return""" 151 __super_init = addinfourl.__init__ 152 153 def __init__(self, url, code, msg, hdrs, fp): 154 self.code = code 155 self.msg = msg 156 self.hdrs = hdrs 157 self.fp = fp 158 self.filename = url 159 # The addinfourl classes depend on fp being a valid file 160 # object. In some cases, the HTTPError may not have a valid 161 # file object. If this happens, the simplest workaround is to 162 # not initialize the base classes. 163 if fp is not None: 164 self.__super_init(fp, hdrs, url, code) 165 166 def __str__(self): 167 return 'HTTP Error %s: %s' % (self.code, self.msg) 168 169 # copied from cookielib.py 170 _cut_port_re = re.compile(r":\d+$") 171 def request_host(request): 172 """Return request-host, as defined by RFC 2965. 173 174 Variation from RFC: returned value is lowercased, for convenient 175 comparison. 176 177 """ 178 url = request.get_full_url() 179 host = urlparse.urlparse(url)[1] 180 if host == "": 181 host = request.get_header("Host", "") 182 183 # remove port, if present 184 host = _cut_port_re.sub("", host, 1) 185 return host.lower() 186 187 class Request: 188 189 def __init__(self, url, data=None, headers={}, 190 origin_req_host=None, unverifiable=False): 191 # unwrap('<URL:type://host/path>') --> 'type://host/path' 192 self.__original = unwrap(url) 193 self.__original, self.__fragment = splittag(self.__original) 194 self.type = None 195 # self.__r_type is what's left after doing the splittype 196 self.host = None 197 self.port = None 198 self._tunnel_host = None 199 self.data = data 200 self.headers = {} 201 for key, value in headers.items(): 202 self.add_header(key, value) 203 self.unredirected_hdrs = {} 204 if origin_req_host is None: 205 origin_req_host = request_host(self) 206 self.origin_req_host = origin_req_host 207 self.unverifiable = unverifiable 208 209 def __getattr__(self, attr): 210 # XXX this is a fallback mechanism to guard against these 211 # methods getting called in a non-standard order. this may be 212 # too complicated and/or unnecessary. 213 # XXX should the __r_XXX attributes be public? 214 if attr[:12] == '_Request__r_': 215 name = attr[12:] 216 if hasattr(Request, 'get_' + name): 217 getattr(self, 'get_' + name)() 218 return getattr(self, attr) 219 raise AttributeError, attr 220 221 def get_method(self): 222 if self.has_data(): 223 return "POST" 224 else: 225 return "GET" 226 227 # XXX these helper methods are lame 228 229 def add_data(self, data): 230 self.data = data 231 232 def has_data(self): 233 return self.data is not None 234 235 def get_data(self): 236 return self.data 237 238 def get_full_url(self): 239 if self.__fragment: 240 return '%s#%s' % (self.__original, self.__fragment) 241 else: 242 return self.__original 243 244 def get_type(self): 245 if self.type is None: 246 self.type, self.__r_type = splittype(self.__original) 247 if self.type is None: 248 raise ValueError, "unknown url type: %s" % self.__original 249 return self.type 250 251 def get_host(self): 252 if self.host is None: 253 self.host, self.__r_host = splithost(self.__r_type) 254 if self.host: 255 self.host = unquote(self.host) 256 return self.host 257 258 def get_selector(self): 259 return self.__r_host 260 261 def set_proxy(self, host, type): 262 if self.type == 'https' and not self._tunnel_host: 263 self._tunnel_host = self.host 264 else: 265 self.type = type 266 self.__r_host = self.__original 267 268 self.host = host 269 270 def has_proxy(self): 271 return self.__r_host == self.__original 272 273 def get_origin_req_host(self): 274 return self.origin_req_host 275 276 def is_unverifiable(self): 277 return self.unverifiable 278 279 def add_header(self, key, val): 280 # useful for something like authentication 281 self.headers[key.capitalize()] = val 282 283 def add_unredirected_header(self, key, val): 284 # will not be added to a redirected request 285 self.unredirected_hdrs[key.capitalize()] = val 286 287 def has_header(self, header_name): 288 return (header_name in self.headers or 289 header_name in self.unredirected_hdrs) 290 291 def get_header(self, header_name, default=None): 292 return self.headers.get( 293 header_name, 294 self.unredirected_hdrs.get(header_name, default)) 295 296 def header_items(self): 297 hdrs = self.unredirected_hdrs.copy() 298 hdrs.update(self.headers) 299 return hdrs.items() 300 301 class OpenerDirector: 302 def __init__(self): 303 client_version = "Python-urllib/%s" % __version__ 304 self.addheaders = [('User-agent', client_version)] 305 # self.handlers is retained only for backward compatibility 306 self.handlers = [] 307 # manage the individual handlers 308 self.handle_open = {} 309 self.handle_error = {} 310 self.process_response = {} 311 self.process_request = {} 312 313 def add_handler(self, handler): 314 if not hasattr(handler, "add_parent"): 315 raise TypeError("expected BaseHandler instance, got %r" % 316 type(handler)) 317 318 added = False 319 for meth in dir(handler): 320 if meth in ["redirect_request", "do_open", "proxy_open"]: 321 # oops, coincidental match 322 continue 323 324 i = meth.find("_") 325 protocol = meth[:i] 326 condition = meth[i+1:] 327 328 if condition.startswith("error"): 329 j = condition.find("_") + i + 1 330 kind = meth[j+1:] 331 try: 332 kind = int(kind) 333 except ValueError: 334 pass 335 lookup = self.handle_error.get(protocol, {}) 336 self.handle_error[protocol] = lookup 337 elif condition == "open": 338 kind = protocol 339 lookup = self.handle_open 340 elif condition == "response": 341 kind = protocol 342 lookup = self.process_response 343 elif condition == "request": 344 kind = protocol 345 lookup = self.process_request 346 else: 347 continue 348 349 handlers = lookup.setdefault(kind, []) 350 if handlers: 351 bisect.insort(handlers, handler) 352 else: 353 handlers.append(handler) 354 added = True 355 356 if added: 357 bisect.insort(self.handlers, handler) 358 handler.add_parent(self) 359 360 def close(self): 361 # Only exists for backwards compatibility. 362 pass 363 364 def _call_chain(self, chain, kind, meth_name, *args): 365 # Handlers raise an exception if no one else should try to handle 366 # the request, or return None if they can't but another handler 367 # could. Otherwise, they return the response. 368 handlers = chain.get(kind, ()) 369 for handler in handlers: 370 func = getattr(handler, meth_name) 371 372 result = func(*args) 373 if result is not None: 374 return result 375 376 def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): 377 # accept a URL or a Request object 378 if isinstance(fullurl, basestring): 379 req = Request(fullurl, data) 380 else: 381 req = fullurl 382 if data is not None: 383 req.add_data(data) 384 385 req.timeout = timeout 386 protocol = req.get_type() 387 388 # pre-process request 389 meth_name = protocol+"_request" 390 for processor in self.process_request.get(protocol, []): 391 meth = getattr(processor, meth_name) 392 req = meth(req) 393 394 response = self._open(req, data) 395 396 # post-process response 397 meth_name = protocol+"_response" 398 for processor in self.process_response.get(protocol, []): 399 meth = getattr(processor, meth_name) 400 response = meth(req, response) 401 402 return response 403 404 def _open(self, req, data=None): 405 result = self._call_chain(self.handle_open, 'default', 406 'default_open', req) 407 if result: 408 return result 409 410 protocol = req.get_type() 411 result = self._call_chain(self.handle_open, protocol, protocol + 412 '_open', req) 413 if result: 414 return result 415 416 return self._call_chain(self.handle_open, 'unknown', 417 'unknown_open', req) 418 419 def error(self, proto, *args): 420 if proto in ('http', 'https'): 421 # XXX http[s] protocols are special-cased 422 dict = self.handle_error['http'] # https is not different than http 423 proto = args[2] # YUCK! 424 meth_name = 'http_error_%s' % proto 425 http_err = 1 426 orig_args = args 427 else: 428 dict = self.handle_error 429 meth_name = proto + '_error' 430 http_err = 0 431 args = (dict, proto, meth_name) + args 432 result = self._call_chain(*args) 433 if result: 434 return result 435 436 if http_err: 437 args = (dict, 'default', 'http_error_default') + orig_args 438 return self._call_chain(*args) 439 440 # XXX probably also want an abstract factory that knows when it makes 441 # sense to skip a superclass in favor of a subclass and when it might 442 # make sense to include both 443 444 def build_opener(*handlers): 445 """Create an opener object from a list of handlers. 446 447 The opener will use several default handlers, including support 448 for HTTP, FTP and when applicable, HTTPS. 449 450 If any of the handlers passed as arguments are subclasses of the 451 default handlers, the default handlers will not be used. 452 """ 453 import types 454 def isclass(obj): 455 return isinstance(obj, (types.ClassType, type)) 456 457 opener = OpenerDirector() 458 default_classes = [ProxyHandler, UnknownHandler, HTTPHandler, 459 HTTPDefaultErrorHandler, HTTPRedirectHandler, 460 FTPHandler, FileHandler, HTTPErrorProcessor] 461 if hasattr(httplib, 'HTTPS'): 462 default_classes.append(HTTPSHandler) 463 skip = set() 464 for klass in default_classes: 465 for check in handlers: 466 if isclass(check): 467 if issubclass(check, klass): 468 skip.add(klass) 469 elif isinstance(check, klass): 470 skip.add(klass) 471 for klass in skip: 472 default_classes.remove(klass) 473 474 for klass in default_classes: 475 opener.add_handler(klass()) 476 477 for h in handlers: 478 if isclass(h): 479 h = h() 480 opener.add_handler(h) 481 return opener 482 483 class BaseHandler: 484 handler_order = 500 485 486 def add_parent(self, parent): 487 self.parent = parent 488 489 def close(self): 490 # Only exists for backwards compatibility 491 pass 492 493 def __lt__(self, other): 494 if not hasattr(other, "handler_order"): 495 # Try to preserve the old behavior of having custom classes 496 # inserted after default ones (works only for custom user 497 # classes which are not aware of handler_order). 498 return True 499 return self.handler_order < other.handler_order 500 501 502 class HTTPErrorProcessor(BaseHandler): 503 """Process HTTP error responses.""" 504 handler_order = 1000 # after all other processing 505 506 def http_response(self, request, response): 507 code, msg, hdrs = response.code, response.msg, response.info() 508 509 # According to RFC 2616, "2xx" code indicates that the client's 510 # request was successfully received, understood, and accepted. 511 if not (200 <= code < 300): 512 response = self.parent.error( 513 'http', request, response, code, msg, hdrs) 514 515 return response 516 517 https_response = http_response 518 519 class HTTPDefaultErrorHandler(BaseHandler): 520 def http_error_default(self, req, fp, code, msg, hdrs): 521 raise HTTPError(req.get_full_url(), code, msg, hdrs, fp) 522 523 class HTTPRedirectHandler(BaseHandler): 524 # maximum number of redirections to any single URL 525 # this is needed because of the state that cookies introduce 526 max_repeats = 4 527 # maximum total number of redirections (regardless of URL) before 528 # assuming we're in a loop 529 max_redirections = 10 530 531 def redirect_request(self, req, fp, code, msg, headers, newurl): 532 """Return a Request or None in response to a redirect. 533 534 This is called by the http_error_30x methods when a 535 redirection response is received. If a redirection should 536 take place, return a new Request to allow http_error_30x to 537 perform the redirect. Otherwise, raise HTTPError if no-one 538 else should try to handle this url. Return None if you can't 539 but another Handler might. 540 """ 541 m = req.get_method() 542 if (code in (301, 302, 303, 307) and m in ("GET", "HEAD") 543 or code in (301, 302, 303) and m == "POST"): 544 # Strictly (according to RFC 2616), 301 or 302 in response 545 # to a POST MUST NOT cause a redirection without confirmation 546 # from the user (of urllib2, in this case). In practice, 547 # essentially all clients do redirect in this case, so we 548 # do the same. 549 # be conciliant with URIs containing a space 550 newurl = newurl.replace(' ', '%20') 551 newheaders = dict((k,v) for k,v in req.headers.items() 552 if k.lower() not in ("content-length", "content-type") 553 ) 554 return Request(newurl, 555 headers=newheaders, 556 origin_req_host=req.get_origin_req_host(), 557 unverifiable=True) 558 else: 559 raise HTTPError(req.get_full_url(), code, msg, headers, fp) 560 561 # Implementation note: To avoid the server sending us into an 562 # infinite loop, the request object needs to track what URLs we 563 # have already seen. Do this by adding a handler-specific 564 # attribute to the Request object. 565 def http_error_302(self, req, fp, code, msg, headers): 566 # Some servers (incorrectly) return multiple Location headers 567 # (so probably same goes for URI). Use first header. 568 if 'location' in headers: 569 newurl = headers.getheaders('location')[0] 570 elif 'uri' in headers: 571 newurl = headers.getheaders('uri')[0] 572 else: 573 return 574 575 # fix a possible malformed URL 576 urlparts = urlparse.urlparse(newurl) 577 if not urlparts.path: 578 urlparts = list(urlparts) 579 urlparts[2] = "/" 580 newurl = urlparse.urlunparse(urlparts) 581 582 newurl = urlparse.urljoin(req.get_full_url(), newurl) 583 584 # For security reasons we do not allow redirects to protocols 585 # other than HTTP, HTTPS or FTP. 586 newurl_lower = newurl.lower() 587 if not (newurl_lower.startswith('http://') or 588 newurl_lower.startswith('https://') or 589 newurl_lower.startswith('ftp://')): 590 raise HTTPError(newurl, code, 591 msg + " - Redirection to url '%s' is not allowed" % 592 newurl, 593 headers, fp) 594 595 # XXX Probably want to forget about the state of the current 596 # request, although that might interact poorly with other 597 # handlers that also use handler-specific request attributes 598 new = self.redirect_request(req, fp, code, msg, headers, newurl) 599 if new is None: 600 return 601 602 # loop detection 603 # .redirect_dict has a key url if url was previously visited. 604 if hasattr(req, 'redirect_dict'): 605 visited = new.redirect_dict = req.redirect_dict 606 if (visited.get(newurl, 0) >= self.max_repeats or 607 len(visited) >= self.max_redirections): 608 raise HTTPError(req.get_full_url(), code, 609 self.inf_msg + msg, headers, fp) 610 else: 611 visited = new.redirect_dict = req.redirect_dict = {} 612 visited[newurl] = visited.get(newurl, 0) + 1 613 614 # Don't close the fp until we are sure that we won't use it 615 # with HTTPError. 616 fp.read() 617 fp.close() 618 619 return self.parent.open(new, timeout=req.timeout) 620 621 http_error_301 = http_error_303 = http_error_307 = http_error_302 622 623 inf_msg = "The HTTP server returned a redirect error that would " \ 624 "lead to an infinite loop.\n" \ 625 "The last 30x error message was:\n" 626 627 628 def _parse_proxy(proxy): 629 """Return (scheme, user, password, host/port) given a URL or an authority. 630 631 If a URL is supplied, it must have an authority (host:port) component. 632 According to RFC 3986, having an authority component means the URL must 633 have two slashes after the scheme: 634 635 >>> _parse_proxy('file:/ftp.example.com/') 636 Traceback (most recent call last): 637 ValueError: proxy URL with no authority: 'file:/ftp.example.com/' 638 639 The first three items of the returned tuple may be None. 640 641 Examples of authority parsing: 642 643 >>> _parse_proxy('proxy.example.com') 644 (None, None, None, 'proxy.example.com') 645 >>> _parse_proxy('proxy.example.com:3128') 646 (None, None, None, 'proxy.example.com:3128') 647 648 The authority component may optionally include userinfo (assumed to be 649 username:password): 650 651 >>> _parse_proxy('joe:password (at] proxy.example.com') 652 (None, 'joe', 'password', 'proxy.example.com') 653 >>> _parse_proxy('joe:password (at] proxy.example.com:3128') 654 (None, 'joe', 'password', 'proxy.example.com:3128') 655 656 Same examples, but with URLs instead: 657 658 >>> _parse_proxy('http://proxy.example.com/') 659 ('http', None, None, 'proxy.example.com') 660 >>> _parse_proxy('http://proxy.example.com:3128/') 661 ('http', None, None, 'proxy.example.com:3128') 662 >>> _parse_proxy('http://joe:password@proxy.example.com/') 663 ('http', 'joe', 'password', 'proxy.example.com') 664 >>> _parse_proxy('http://joe:password@proxy.example.com:3128') 665 ('http', 'joe', 'password', 'proxy.example.com:3128') 666 667 Everything after the authority is ignored: 668 669 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128') 670 ('ftp', 'joe', 'password', 'proxy.example.com') 671 672 Test for no trailing '/' case: 673 674 >>> _parse_proxy('http://joe:password@proxy.example.com') 675 ('http', 'joe', 'password', 'proxy.example.com') 676 677 """ 678 scheme, r_scheme = splittype(proxy) 679 if not r_scheme.startswith("/"): 680 # authority 681 scheme = None 682 authority = proxy 683 else: 684 # URL 685 if not r_scheme.startswith("//"): 686 raise ValueError("proxy URL with no authority: %r" % proxy) 687 # We have an authority, so for RFC 3986-compliant URLs (by ss 3. 688 # and 3.3.), path is empty or starts with '/' 689 end = r_scheme.find("/", 2) 690 if end == -1: 691 end = None 692 authority = r_scheme[2:end] 693 userinfo, hostport = splituser(authority) 694 if userinfo is not None: 695 user, password = splitpasswd(userinfo) 696 else: 697 user = password = None 698 return scheme, user, password, hostport 699 700 class ProxyHandler(BaseHandler): 701 # Proxies must be in front 702 handler_order = 100 703 704 def __init__(self, proxies=None): 705 if proxies is None: 706 proxies = getproxies() 707 assert hasattr(proxies, 'has_key'), "proxies must be a mapping" 708 self.proxies = proxies 709 for type, url in proxies.items(): 710 setattr(self, '%s_open' % type, 711 lambda r, proxy=url, type=type, meth=self.proxy_open: \ 712 meth(r, proxy, type)) 713 714 def proxy_open(self, req, proxy, type): 715 orig_type = req.get_type() 716 proxy_type, user, password, hostport = _parse_proxy(proxy) 717 718 if proxy_type is None: 719 proxy_type = orig_type 720 721 if req.host and proxy_bypass(req.host): 722 return None 723 724 if user and password: 725 user_pass = '%s:%s' % (unquote(user), unquote(password)) 726 creds = base64.b64encode(user_pass).strip() 727 req.add_header('Proxy-authorization', 'Basic ' + creds) 728 hostport = unquote(hostport) 729 req.set_proxy(hostport, proxy_type) 730 731 if orig_type == proxy_type or orig_type == 'https': 732 # let other handlers take care of it 733 return None 734 else: 735 # need to start over, because the other handlers don't 736 # grok the proxy's URL type 737 # e.g. if we have a constructor arg proxies like so: 738 # {'http': 'ftp://proxy.example.com'}, we may end up turning 739 # a request for http://acme.example.com/a into one for 740 # ftp://proxy.example.com/a 741 return self.parent.open(req, timeout=req.timeout) 742 743 class HTTPPasswordMgr: 744 745 def __init__(self): 746 self.passwd = {} 747 748 def add_password(self, realm, uri, user, passwd): 749 # uri could be a single URI or a sequence 750 if isinstance(uri, basestring): 751 uri = [uri] 752 if not realm in self.passwd: 753 self.passwd[realm] = {} 754 for default_port in True, False: 755 reduced_uri = tuple( 756 [self.reduce_uri(u, default_port) for u in uri]) 757 self.passwd[realm][reduced_uri] = (user, passwd) 758 759 def find_user_password(self, realm, authuri): 760 domains = self.passwd.get(realm, {}) 761 for default_port in True, False: 762 reduced_authuri = self.reduce_uri(authuri, default_port) 763 for uris, authinfo in domains.iteritems(): 764 for uri in uris: 765 if self.is_suburi(uri, reduced_authuri): 766 return authinfo 767 return None, None 768 769 def reduce_uri(self, uri, default_port=True): 770 """Accept authority or URI and extract only the authority and path.""" 771 # note HTTP URLs do not have a userinfo component 772 parts = urlparse.urlsplit(uri) 773 if parts[1]: 774 # URI 775 scheme = parts[0] 776 authority = parts[1] 777 path = parts[2] or '/' 778 else: 779 # host or host:port 780 scheme = None 781 authority = uri 782 path = '/' 783 host, port = splitport(authority) 784 if default_port and port is None and scheme is not None: 785 dport = {"http": 80, 786 "https": 443, 787 }.get(scheme) 788 if dport is not None: 789 authority = "%s:%d" % (host, dport) 790 return authority, path 791 792 def is_suburi(self, base, test): 793 """Check if test is below base in a URI tree 794 795 Both args must be URIs in reduced form. 796 """ 797 if base == test: 798 return True 799 if base[0] != test[0]: 800 return False 801 common = posixpath.commonprefix((base[1], test[1])) 802 if len(common) == len(base[1]): 803 return True 804 return False 805 806 807 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): 808 809 def find_user_password(self, realm, authuri): 810 user, password = HTTPPasswordMgr.find_user_password(self, realm, 811 authuri) 812 if user is not None: 813 return user, password 814 return HTTPPasswordMgr.find_user_password(self, None, authuri) 815 816 817 class AbstractBasicAuthHandler: 818 819 # XXX this allows for multiple auth-schemes, but will stupidly pick 820 # the last one with a realm specified. 821 822 # allow for double- and single-quoted realm values 823 # (single quotes are a violation of the RFC, but appear in the wild) 824 rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+' 825 'realm=(["\'])(.*?)\\2', re.I) 826 827 # XXX could pre-emptively send auth info already accepted (RFC 2617, 828 # end of section 2, and section 1.2 immediately after "credentials" 829 # production). 830 831 def __init__(self, password_mgr=None): 832 if password_mgr is None: 833 password_mgr = HTTPPasswordMgr() 834 self.passwd = password_mgr 835 self.add_password = self.passwd.add_password 836 self.retried = 0 837 838 def reset_retry_count(self): 839 self.retried = 0 840 841 def http_error_auth_reqed(self, authreq, host, req, headers): 842 # host may be an authority (without userinfo) or a URL with an 843 # authority 844 # XXX could be multiple headers 845 authreq = headers.get(authreq, None) 846 847 if self.retried > 5: 848 # retry sending the username:password 5 times before failing. 849 raise HTTPError(req.get_full_url(), 401, "basic auth failed", 850 headers, None) 851 else: 852 self.retried += 1 853 854 if authreq: 855 mo = AbstractBasicAuthHandler.rx.search(authreq) 856 if mo: 857 scheme, quote, realm = mo.groups() 858 if scheme.lower() == 'basic': 859 response = self.retry_http_basic_auth(host, req, realm) 860 if response and response.code != 401: 861 self.retried = 0 862 return response 863 864 def retry_http_basic_auth(self, host, req, realm): 865 user, pw = self.passwd.find_user_password(realm, host) 866 if pw is not None: 867 raw = "%s:%s" % (user, pw) 868 auth = 'Basic %s' % base64.b64encode(raw).strip() 869 if req.headers.get(self.auth_header, None) == auth: 870 return None 871 req.add_unredirected_header(self.auth_header, auth) 872 return self.parent.open(req, timeout=req.timeout) 873 else: 874 return None 875 876 877 class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): 878 879 auth_header = 'Authorization' 880 881 def http_error_401(self, req, fp, code, msg, headers): 882 url = req.get_full_url() 883 response = self.http_error_auth_reqed('www-authenticate', 884 url, req, headers) 885 self.reset_retry_count() 886 return response 887 888 889 class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): 890 891 auth_header = 'Proxy-authorization' 892 893 def http_error_407(self, req, fp, code, msg, headers): 894 # http_error_auth_reqed requires that there is no userinfo component in 895 # authority. Assume there isn't one, since urllib2 does not (and 896 # should not, RFC 3986 s. 3.2.1) support requests for URLs containing 897 # userinfo. 898 authority = req.get_host() 899 response = self.http_error_auth_reqed('proxy-authenticate', 900 authority, req, headers) 901 self.reset_retry_count() 902 return response 903 904 905 def randombytes(n): 906 """Return n random bytes.""" 907 # Use /dev/urandom if it is available. Fall back to random module 908 # if not. It might be worthwhile to extend this function to use 909 # other platform-specific mechanisms for getting random bytes. 910 if os.path.exists("/dev/urandom"): 911 f = open("/dev/urandom") 912 s = f.read(n) 913 f.close() 914 return s 915 else: 916 L = [chr(random.randrange(0, 256)) for i in range(n)] 917 return "".join(L) 918 919 class AbstractDigestAuthHandler: 920 # Digest authentication is specified in RFC 2617. 921 922 # XXX The client does not inspect the Authentication-Info header 923 # in a successful response. 924 925 # XXX It should be possible to test this implementation against 926 # a mock server that just generates a static set of challenges. 927 928 # XXX qop="auth-int" supports is shaky 929 930 def __init__(self, passwd=None): 931 if passwd is None: 932 passwd = HTTPPasswordMgr() 933 self.passwd = passwd 934 self.add_password = self.passwd.add_password 935 self.retried = 0 936 self.nonce_count = 0 937 self.last_nonce = None 938 939 def reset_retry_count(self): 940 self.retried = 0 941 942 def http_error_auth_reqed(self, auth_header, host, req, headers): 943 authreq = headers.get(auth_header, None) 944 if self.retried > 5: 945 # Don't fail endlessly - if we failed once, we'll probably 946 # fail a second time. Hm. Unless the Password Manager is 947 # prompting for the information. Crap. This isn't great 948 # but it's better than the current 'repeat until recursion 949 # depth exceeded' approach <wink> 950 raise HTTPError(req.get_full_url(), 401, "digest auth failed", 951 headers, None) 952 else: 953 self.retried += 1 954 if authreq: 955 scheme = authreq.split()[0] 956 if scheme.lower() == 'digest': 957 return self.retry_http_digest_auth(req, authreq) 958 959 def retry_http_digest_auth(self, req, auth): 960 token, challenge = auth.split(' ', 1) 961 chal = parse_keqv_list(parse_http_list(challenge)) 962 auth = self.get_authorization(req, chal) 963 if auth: 964 auth_val = 'Digest %s' % auth 965 if req.headers.get(self.auth_header, None) == auth_val: 966 return None 967 req.add_unredirected_header(self.auth_header, auth_val) 968 resp = self.parent.open(req, timeout=req.timeout) 969 return resp 970 971 def get_cnonce(self, nonce): 972 # The cnonce-value is an opaque 973 # quoted string value provided by the client and used by both client 974 # and server to avoid chosen plaintext attacks, to provide mutual 975 # authentication, and to provide some message integrity protection. 976 # This isn't a fabulous effort, but it's probably Good Enough. 977 dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(), 978 randombytes(8))).hexdigest() 979 return dig[:16] 980 981 def get_authorization(self, req, chal): 982 try: 983 realm = chal['realm'] 984 nonce = chal['nonce'] 985 qop = chal.get('qop') 986 algorithm = chal.get('algorithm', 'MD5') 987 # mod_digest doesn't send an opaque, even though it isn't 988 # supposed to be optional 989 opaque = chal.get('opaque', None) 990 except KeyError: 991 return None 992 993 H, KD = self.get_algorithm_impls(algorithm) 994 if H is None: 995 return None 996 997 user, pw = self.passwd.find_user_password(realm, req.get_full_url()) 998 if user is None: 999 return None 1000 1001 # XXX not implemented yet 1002 if req.has_data(): 1003 entdig = self.get_entity_digest(req.get_data(), chal) 1004 else: 1005 entdig = None 1006 1007 A1 = "%s:%s:%s" % (user, realm, pw) 1008 A2 = "%s:%s" % (req.get_method(), 1009 # XXX selector: what about proxies and full urls 1010 req.get_selector()) 1011 if qop == 'auth': 1012 if nonce == self.last_nonce: 1013 self.nonce_count += 1 1014 else: 1015 self.nonce_count = 1 1016 self.last_nonce = nonce 1017 1018 ncvalue = '%08x' % self.nonce_count 1019 cnonce = self.get_cnonce(nonce) 1020 noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2)) 1021 respdig = KD(H(A1), noncebit) 1022 elif qop is None: 1023 respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) 1024 else: 1025 # XXX handle auth-int. 1026 raise URLError("qop '%s' is not supported." % qop) 1027 1028 # XXX should the partial digests be encoded too? 1029 1030 base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ 1031 'response="%s"' % (user, realm, nonce, req.get_selector(), 1032 respdig) 1033 if opaque: 1034 base += ', opaque="%s"' % opaque 1035 if entdig: 1036 base += ', digest="%s"' % entdig 1037 base += ', algorithm="%s"' % algorithm 1038 if qop: 1039 base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) 1040 return base 1041 1042 def get_algorithm_impls(self, algorithm): 1043 # algorithm should be case-insensitive according to RFC2617 1044 algorithm = algorithm.upper() 1045 # lambdas assume digest modules are imported at the top level 1046 if algorithm == 'MD5': 1047 H = lambda x: hashlib.md5(x).hexdigest() 1048 elif algorithm == 'SHA': 1049 H = lambda x: hashlib.sha1(x).hexdigest() 1050 # XXX MD5-sess 1051 KD = lambda s, d: H("%s:%s" % (s, d)) 1052 return H, KD 1053 1054 def get_entity_digest(self, data, chal): 1055 # XXX not implemented yet 1056 return None 1057 1058 1059 class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): 1060 """An authentication protocol defined by RFC 2069 1061 1062 Digest authentication improves on basic authentication because it 1063 does not transmit passwords in the clear. 1064 """ 1065 1066 auth_header = 'Authorization' 1067 handler_order = 490 # before Basic auth 1068 1069 def http_error_401(self, req, fp, code, msg, headers): 1070 host = urlparse.urlparse(req.get_full_url())[1] 1071 retry = self.http_error_auth_reqed('www-authenticate', 1072 host, req, headers) 1073 self.reset_retry_count() 1074 return retry 1075 1076 1077 class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): 1078 1079 auth_header = 'Proxy-Authorization' 1080 handler_order = 490 # before Basic auth 1081 1082 def http_error_407(self, req, fp, code, msg, headers): 1083 host = req.get_host() 1084 retry = self.http_error_auth_reqed('proxy-authenticate', 1085 host, req, headers) 1086 self.reset_retry_count() 1087 return retry 1088 1089 class AbstractHTTPHandler(BaseHandler): 1090 1091 def __init__(self, debuglevel=0): 1092 self._debuglevel = debuglevel 1093 1094 def set_http_debuglevel(self, level): 1095 self._debuglevel = level 1096 1097 def do_request_(self, request): 1098 host = request.get_host() 1099 if not host: 1100 raise URLError('no host given') 1101 1102 if request.has_data(): # POST 1103 data = request.get_data() 1104 if not request.has_header('Content-type'): 1105 request.add_unredirected_header( 1106 'Content-type', 1107 'application/x-www-form-urlencoded') 1108 if not request.has_header('Content-length'): 1109 request.add_unredirected_header( 1110 'Content-length', '%d' % len(data)) 1111 1112 sel_host = host 1113 if request.has_proxy(): 1114 scheme, sel = splittype(request.get_selector()) 1115 sel_host, sel_path = splithost(sel) 1116 1117 if not request.has_header('Host'): 1118 request.add_unredirected_header('Host', sel_host) 1119 for name, value in self.parent.addheaders: 1120 name = name.capitalize() 1121 if not request.has_header(name): 1122 request.add_unredirected_header(name, value) 1123 1124 return request 1125 1126 def do_open(self, http_class, req): 1127 """Return an addinfourl object for the request, using http_class. 1128 1129 http_class must implement the HTTPConnection API from httplib. 1130 The addinfourl return value is a file-like object. It also 1131 has methods and attributes including: 1132 - info(): return a mimetools.Message object for the headers 1133 - geturl(): return the original request URL 1134 - code: HTTP status code 1135 """ 1136 host = req.get_host() 1137 if not host: 1138 raise URLError('no host given') 1139 1140 h = http_class(host, timeout=req.timeout) # will parse host:port 1141 h.set_debuglevel(self._debuglevel) 1142 1143 headers = dict(req.unredirected_hdrs) 1144 headers.update(dict((k, v) for k, v in req.headers.items() 1145 if k not in headers)) 1146 1147 # We want to make an HTTP/1.1 request, but the addinfourl 1148 # class isn't prepared to deal with a persistent connection. 1149 # It will try to read all remaining data from the socket, 1150 # which will block while the server waits for the next request. 1151 # So make sure the connection gets closed after the (only) 1152 # request. 1153 headers["Connection"] = "close" 1154 headers = dict( 1155 (name.title(), val) for name, val in headers.items()) 1156 1157 if req._tunnel_host: 1158 tunnel_headers = {} 1159 proxy_auth_hdr = "Proxy-Authorization" 1160 if proxy_auth_hdr in headers: 1161 tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr] 1162 # Proxy-Authorization should not be sent to origin 1163 # server. 1164 del headers[proxy_auth_hdr] 1165 h.set_tunnel(req._tunnel_host, headers=tunnel_headers) 1166 1167 try: 1168 h.request(req.get_method(), req.get_selector(), req.data, headers) 1169 try: 1170 r = h.getresponse(buffering=True) 1171 except TypeError: #buffering kw not supported 1172 r = h.getresponse() 1173 except socket.error, err: # XXX what error? 1174 raise URLError(err) 1175 1176 # Pick apart the HTTPResponse object to get the addinfourl 1177 # object initialized properly. 1178 1179 # Wrap the HTTPResponse object in socket's file object adapter 1180 # for Windows. That adapter calls recv(), so delegate recv() 1181 # to read(). This weird wrapping allows the returned object to 1182 # have readline() and readlines() methods. 1183 1184 # XXX It might be better to extract the read buffering code 1185 # out of socket._fileobject() and into a base class. 1186 1187 r.recv = r.read 1188 fp = socket._fileobject(r, close=True) 1189 1190 resp = addinfourl(fp, r.msg, req.get_full_url()) 1191 resp.code = r.status 1192 resp.msg = r.reason 1193 return resp 1194 1195 1196 class HTTPHandler(AbstractHTTPHandler): 1197 1198 def http_open(self, req): 1199 return self.do_open(httplib.HTTPConnection, req) 1200 1201 http_request = AbstractHTTPHandler.do_request_ 1202 1203 if hasattr(httplib, 'HTTPS'): 1204 class HTTPSHandler(AbstractHTTPHandler): 1205 1206 def https_open(self, req): 1207 return self.do_open(httplib.HTTPSConnection, req) 1208 1209 https_request = AbstractHTTPHandler.do_request_ 1210 1211 class HTTPCookieProcessor(BaseHandler): 1212 def __init__(self, cookiejar=None): 1213 import cookielib 1214 if cookiejar is None: 1215 cookiejar = cookielib.CookieJar() 1216 self.cookiejar = cookiejar 1217 1218 def http_request(self, request): 1219 self.cookiejar.add_cookie_header(request) 1220 return request 1221 1222 def http_response(self, request, response): 1223 self.cookiejar.extract_cookies(response, request) 1224 return response 1225 1226 https_request = http_request 1227 https_response = http_response 1228 1229 class UnknownHandler(BaseHandler): 1230 def unknown_open(self, req): 1231 type = req.get_type() 1232 raise URLError('unknown url type: %s' % type) 1233 1234 def parse_keqv_list(l): 1235 """Parse list of key=value strings where keys are not duplicated.""" 1236 parsed = {} 1237 for elt in l: 1238 k, v = elt.split('=', 1) 1239 if v[0] == '"' and v[-1] == '"': 1240 v = v[1:-1] 1241 parsed[k] = v 1242 return parsed 1243 1244 def parse_http_list(s): 1245 """Parse lists as described by RFC 2068 Section 2. 1246 1247 In particular, parse comma-separated lists where the elements of 1248 the list may include quoted-strings. A quoted-string could 1249 contain a comma. A non-quoted string could have quotes in the 1250 middle. Neither commas nor quotes count if they are escaped. 1251 Only double-quotes count, not single-quotes. 1252 """ 1253 res = [] 1254 part = '' 1255 1256 escape = quote = False 1257 for cur in s: 1258 if escape: 1259 part += cur 1260 escape = False 1261 continue 1262 if quote: 1263 if cur == '\\': 1264 escape = True 1265 continue 1266 elif cur == '"': 1267 quote = False 1268 part += cur 1269 continue 1270 1271 if cur == ',': 1272 res.append(part) 1273 part = '' 1274 continue 1275 1276 if cur == '"': 1277 quote = True 1278 1279 part += cur 1280 1281 # append last part 1282 if part: 1283 res.append(part) 1284 1285 return [part.strip() for part in res] 1286 1287 def _safe_gethostbyname(host): 1288 try: 1289 return socket.gethostbyname(host) 1290 except socket.gaierror: 1291 return None 1292 1293 class FileHandler(BaseHandler): 1294 # Use local file or FTP depending on form of URL 1295 def file_open(self, req): 1296 url = req.get_selector() 1297 if url[:2] == '//' and url[2:3] != '/' and (req.host and 1298 req.host != 'localhost'): 1299 req.type = 'ftp' 1300 return self.parent.open(req) 1301 else: 1302 return self.open_local_file(req) 1303 1304 # names for the localhost 1305 names = None 1306 def get_names(self): 1307 if FileHandler.names is None: 1308 try: 1309 FileHandler.names = tuple( 1310 socket.gethostbyname_ex('localhost')[2] + 1311 socket.gethostbyname_ex(socket.gethostname())[2]) 1312 except socket.gaierror: 1313 FileHandler.names = (socket.gethostbyname('localhost'),) 1314 return FileHandler.names 1315 1316 # not entirely sure what the rules are here 1317 def open_local_file(self, req): 1318 import email.utils 1319 import mimetypes 1320 host = req.get_host() 1321 filename = req.get_selector() 1322 localfile = url2pathname(filename) 1323 try: 1324 stats = os.stat(localfile) 1325 size = stats.st_size 1326 modified = email.utils.formatdate(stats.st_mtime, usegmt=True) 1327 mtype = mimetypes.guess_type(filename)[0] 1328 headers = mimetools.Message(StringIO( 1329 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % 1330 (mtype or 'text/plain', size, modified))) 1331 if host: 1332 host, port = splitport(host) 1333 if not host or \ 1334 (not port and _safe_gethostbyname(host) in self.get_names()): 1335 if host: 1336 origurl = 'file://' + host + filename 1337 else: 1338 origurl = 'file://' + filename 1339 return addinfourl(open(localfile, 'rb'), headers, origurl) 1340 except OSError, msg: 1341 # urllib2 users shouldn't expect OSErrors coming from urlopen() 1342 raise URLError(msg) 1343 raise URLError('file not on local host') 1344 1345 class FTPHandler(BaseHandler): 1346 def ftp_open(self, req): 1347 import ftplib 1348 import mimetypes 1349 host = req.get_host() 1350 if not host: 1351 raise URLError('ftp error: no host given') 1352 host, port = splitport(host) 1353 if port is None: 1354 port = ftplib.FTP_PORT 1355 else: 1356 port = int(port) 1357 1358 # username/password handling 1359 user, host = splituser(host) 1360 if user: 1361 user, passwd = splitpasswd(user) 1362 else: 1363 passwd = None 1364 host = unquote(host) 1365 user = user or '' 1366 passwd = passwd or '' 1367 1368 try: 1369 host = socket.gethostbyname(host) 1370 except socket.error, msg: 1371 raise URLError(msg) 1372 path, attrs = splitattr(req.get_selector()) 1373 dirs = path.split('/') 1374 dirs = map(unquote, dirs) 1375 dirs, file = dirs[:-1], dirs[-1] 1376 if dirs and not dirs[0]: 1377 dirs = dirs[1:] 1378 try: 1379 fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout) 1380 type = file and 'I' or 'D' 1381 for attr in attrs: 1382 attr, value = splitvalue(attr) 1383 if attr.lower() == 'type' and \ 1384 value in ('a', 'A', 'i', 'I', 'd', 'D'): 1385 type = value.upper() 1386 fp, retrlen = fw.retrfile(file, type) 1387 headers = "" 1388 mtype = mimetypes.guess_type(req.get_full_url())[0] 1389 if mtype: 1390 headers += "Content-type: %s\n" % mtype 1391 if retrlen is not None and retrlen >= 0: 1392 headers += "Content-length: %d\n" % retrlen 1393 sf = StringIO(headers) 1394 headers = mimetools.Message(sf) 1395 return addinfourl(fp, headers, req.get_full_url()) 1396 except ftplib.all_errors, msg: 1397 raise URLError, ('ftp error: %s' % msg), sys.exc_info()[2] 1398 1399 def connect_ftp(self, user, passwd, host, port, dirs, timeout): 1400 fw = ftpwrapper(user, passwd, host, port, dirs, timeout) 1401 ## fw.ftp.set_debuglevel(1) 1402 return fw 1403 1404 class CacheFTPHandler(FTPHandler): 1405 # XXX would be nice to have pluggable cache strategies 1406 # XXX this stuff is definitely not thread safe 1407 def __init__(self): 1408 self.cache = {} 1409 self.timeout = {} 1410 self.soonest = 0 1411 self.delay = 60 1412 self.max_conns = 16 1413 1414 def setTimeout(self, t): 1415 self.delay = t 1416 1417 def setMaxConns(self, m): 1418 self.max_conns = m 1419 1420 def connect_ftp(self, user, passwd, host, port, dirs, timeout): 1421 key = user, host, port, '/'.join(dirs), timeout 1422 if key in self.cache: 1423 self.timeout[key] = time.time() + self.delay 1424 else: 1425 self.cache[key] = ftpwrapper(user, passwd, host, port, dirs, timeout) 1426 self.timeout[key] = time.time() + self.delay 1427 self.check_cache() 1428 return self.cache[key] 1429 1430 def check_cache(self): 1431 # first check for old ones 1432 t = time.time() 1433 if self.soonest <= t: 1434 for k, v in self.timeout.items(): 1435 if v < t: 1436 self.cache[k].close() 1437 del self.cache[k] 1438 del self.timeout[k] 1439 self.soonest = min(self.timeout.values()) 1440 1441 # then check the size 1442 if len(self.cache) == self.max_conns: 1443 for k, v in self.timeout.items(): 1444 if v == self.soonest: 1445 del self.cache[k] 1446 del self.timeout[k] 1447 break 1448 self.soonest = min(self.timeout.values()) 1449