1 """An extensible library for opening URLs using a variety of protocols 2 3 The simplest way to use this module is to call the urlopen function, 4 which accepts a string containing a URL or a Request object (described 5 below). It opens the URL and returns the results as file-like 6 object; the returned object has some extra methods described below. 7 8 The OpenerDirector manages a collection of Handler objects that do 9 all the actual work. Each Handler implements a particular protocol or 10 option. The OpenerDirector is a composite object that invokes the 11 Handlers needed to open the requested URL. For example, the 12 HTTPHandler performs HTTP GET and POST requests and deals with 13 non-error returns. The HTTPRedirectHandler automatically deals with 14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler 15 deals with digest authentication. 16 17 urlopen(url, data=None) -- Basic usage is the same as original 18 urllib. pass the url and optionally data to post to an HTTP URL, and 19 get a file-like object back. One difference is that you can also pass 20 a Request instance instead of URL. Raises a URLError (subclass of 21 OSError); for HTTP errors, raises an HTTPError, which can also be 22 treated as a valid response. 23 24 build_opener -- Function that creates a new OpenerDirector instance. 25 Will install the default handlers. Accepts one or more Handlers as 26 arguments, either instances or Handler classes that it will 27 instantiate. If one of the argument is a subclass of the default 28 handler, the argument will be installed instead of the default. 29 30 install_opener -- Installs a new opener as the default opener. 31 32 objects of interest: 33 34 OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages 35 the Handler classes, while dealing with requests and responses. 36 37 Request -- An object that encapsulates the state of a request. The 38 state can be as simple as the URL. 
It can also include extra HTTP
headers, e.g. a User-Agent.

BaseHandler --

internals:
BaseHandler and parent
_call_chain conventions

Example usage:

import urllib.request

# set up authentication info
authinfo = urllib.request.HTTPBasicAuthHandler()
authinfo.add_password(realm='PDQ Application',
                      uri='https://mahler:8092/site-updates.py',
                      user='klem',
                      passwd='geheim$parole')

proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})

# build a new opener that adds authentication and caching FTP handlers
opener = urllib.request.build_opener(proxy_support, authinfo,
                                     urllib.request.CacheFTPHandler)

# install it
urllib.request.install_opener(opener)

f = urllib.request.urlopen('http://www.python.org/')
"""

# XXX issues:
# If an authentication error handler that tries to perform
# authentication for some reason but fails, how should the error be
# signalled?  The client needs to know the HTTP error code.  But if
# the handler knows what the problem was, e.g., that it didn't know
# the hash algorithm requested in the challenge, it would be good to
# pass that information along to the client, too.
# ftp errors aren't handled cleanly
# check digest against correct (i.e. non-apache) implementation
# Possible extensions:
# complex proxies  XXX not sure what exactly was meant by this
# abstract factory for opener

import base64
import bisect
import email
import hashlib
import http.client
import io
import os
import posixpath
import re
import socket
import string
import sys
import time
import collections
import tempfile
import contextlib
import warnings


from urllib.error import URLError, HTTPError, ContentTooShortError
from urllib.parse import (
    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
    splittype, splithost, splitport, splituser, splitpasswd,
    splitattr, splitquery, splitvalue, splittag, to_bytes,
    unquote_to_bytes, urlunparse)
from urllib.response import addinfourl, addclosehook

# check for SSL
try:
    import ssl
except ImportError:
    _have_ssl = False
else:
    _have_ssl = True

__all__ = [
    # Classes
    'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler',
    'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler',
    'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
    'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler',
    'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler',
    'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler',
    'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
    'UnknownHandler', 'HTTPErrorProcessor',
    # Functions
    'urlopen', 'install_opener', 'build_opener',
    'pathname2url', 'url2pathname', 'getproxies',
    # Legacy interface
    'urlretrieve', 'urlcleanup', 'URLopener', 'FancyURLopener',
]

# used in User-Agent header sent
__version__ = '%d.%d' % sys.version_info[:2]

# Module-wide default opener, created lazily by urlopen() or set
# explicitly by install_opener().
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            *, cafile=None, capath=None, cadefault=False, context=None):
    '''Open the URL url, which can be either a string or a Request object.

    *data* must be an object specifying additional data to be sent to
    the server, or None if no such data is needed.  See Request for
    details.

    urllib.request module uses HTTP/1.1 and includes a "Connection:close"
    header in its HTTP requests.

    The optional *timeout* parameter specifies a timeout in seconds for
    blocking operations like the connection attempt (if not specified, the
    global default timeout setting will be used).  This only works for HTTP,
    HTTPS and FTP connections.

    If *context* is specified, it must be a ssl.SSLContext instance describing
    the various SSL options.  See HTTPSConnection for more details.

    The optional *cafile* and *capath* parameters specify a set of trusted CA
    certificates for HTTPS requests.  cafile should point to a single file
    containing a bundle of CA certificates, whereas capath should point to a
    directory of hashed certificate files.  More information can be found in
    ssl.SSLContext.load_verify_locations().

    The *cadefault* parameter is ignored.

    This function always returns an object which can work as a context
    manager and has methods such as

    * geturl() - return the URL of the resource retrieved, commonly used to
      determine if a redirect was followed

    * info() - return the meta-information of the page, such as headers, in the
      form of an email.message_from_string() instance (see Quick Reference to
      HTTP Headers)

    * getcode() - return the HTTP status code of the response.  Raises URLError
      on errors.

    For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
    object slightly modified.  In addition to the three new methods above, the
    msg attribute contains the same information as the reason attribute ---
    the reason phrase returned by the server --- instead of the response
    headers as it is specified in the documentation for HTTPResponse.

    For FTP, file, and data URLs and requests explicitly handled by legacy
    URLopener and FancyURLopener classes, this function returns a
    urllib.response.addinfourl object.

    Note that None may be returned if no handler handles the request (though
    the default installed global OpenerDirector uses UnknownHandler to ensure
    this never happens).

    In addition, if proxy settings are detected (for example, when a *_proxy
    environment variable like http_proxy is set), ProxyHandler is default
    installed and makes sure the requests are handled through the proxy.

    '''
    global _opener
    if cafile or capath or cadefault:
        # Fix: the message used to misspell "capath" as "cpath", pointing
        # users at a parameter that does not exist.  Also, 'warnings' is
        # already imported at module level, so the previous function-local
        # import was redundant and has been dropped.
        warnings.warn("cafile, capath and cadefault are deprecated, use a "
                      "custom context instead.", DeprecationWarning, 2)
        if context is not None:
            raise ValueError(
                "You can't pass both context and any of cafile, capath, and "
                "cadefault"
            )
        if not _have_ssl:
            raise ValueError('SSL support not available')
        # Build a one-off opener whose HTTPS handler verifies against the
        # caller-supplied CA material; the global opener is not touched.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH,
                                             cafile=cafile,
                                             capath=capath)
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif context:
        https_handler = HTTPSHandler(context=context)
        opener = build_opener(https_handler)
    elif _opener is None:
        # Lazily create and cache the module-wide default opener.
        _opener = opener = build_opener()
    else:
        opener = _opener
    return opener.open(url, data, timeout)

def install_opener(opener):
    """Install *opener* as the default opener used by urlopen()."""
    global _opener
    _opener = opener

# Paths of NamedTemporaryFiles created by urlretrieve(); removed by
# urlcleanup().
_url_tempfiles = []
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """
    Retrieve a URL into a temporary location on disk.

    Requires a URL argument.  If a filename is passed, it is used as
    the temporary file location.  The reporthook argument should be
    a callable that accepts a block number, a read size, and the
    total file size of the URL target.  The data argument should be
    valid URL encoded data.

    If a filename is passed and the URL points to a local resource,
    the result is a copy from local file to new file.

    Returns a tuple containing the path to the newly created
    data file as well as the resulting HTTPMessage object.
    """
    url_type, path = splittype(url)

    with contextlib.closing(urlopen(url, data)) as fp:
        headers = fp.info()

        # Just return the local path and the "headers" for file://
        # URLs. No sense in performing a copy unless requested.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # Handle temporary file setup.
        if filename:
            tfp = open(filename, 'wb')
        else:
            # delete=False keeps the file after close; remember the path
            # so urlcleanup() can remove it later.
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            _url_tempfiles.append(filename)

        with tfp:
            result = filename, headers
            bs = 1024*8
            size = -1
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])

            if reporthook:
                reporthook(blocknum, bs, size)

            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)

        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

    return result

def urlcleanup():
    """Clean up temporary files from urlretrieve calls."""
    for temp_file in _url_tempfiles:
        try:
            os.unlink(temp_file)
        except OSError:
            # Best effort: the file may already be gone.
            pass

    del _url_tempfiles[:]
    global _opener
    if _opener:
        _opener = None

# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$", re.ASCII)
re.compile(r":\d+$", re.ASCII) 308 def request_host(request): 309 """Return request-host, as defined by RFC 2965. 310 311 Variation from RFC: returned value is lowercased, for convenient 312 comparison. 313 314 """ 315 url = request.full_url 316 host = urlparse(url)[1] 317 if host == "": 318 host = request.get_header("Host", "") 319 320 # remove port, if present 321 host = _cut_port_re.sub("", host, 1) 322 return host.lower() 323 324 class Request: 325 326 def __init__(self, url, data=None, headers={}, 327 origin_req_host=None, unverifiable=False, 328 method=None): 329 self.full_url = url 330 self.headers = {} 331 self.unredirected_hdrs = {} 332 self._data = None 333 self.data = data 334 self._tunnel_host = None 335 for key, value in headers.items(): 336 self.add_header(key, value) 337 if origin_req_host is None: 338 origin_req_host = request_host(self) 339 self.origin_req_host = origin_req_host 340 self.unverifiable = unverifiable 341 if method: 342 self.method = method 343 344 @property 345 def full_url(self): 346 if self.fragment: 347 return '{}#{}'.format(self._full_url, self.fragment) 348 return self._full_url 349 350 @full_url.setter 351 def full_url(self, url): 352 # unwrap('<URL:type://host/path>') --> 'type://host/path' 353 self._full_url = unwrap(url) 354 self._full_url, self.fragment = splittag(self._full_url) 355 self._parse() 356 357 @full_url.deleter 358 def full_url(self): 359 self._full_url = None 360 self.fragment = None 361 self.selector = '' 362 363 @property 364 def data(self): 365 return self._data 366 367 @data.setter 368 def data(self, data): 369 if data != self._data: 370 self._data = data 371 # issue 16464 372 # if we change data we need to remove content-length header 373 # (cause it's most probably calculated for previous value) 374 if self.has_header("Content-length"): 375 self.remove_header("Content-length") 376 377 @data.deleter 378 def data(self): 379 self.data = None 380 381 def _parse(self): 382 self.type, rest = 
splittype(self._full_url) 383 if self.type is None: 384 raise ValueError("unknown url type: %r" % self.full_url) 385 self.host, self.selector = splithost(rest) 386 if self.host: 387 self.host = unquote(self.host) 388 389 def get_method(self): 390 """Return a string indicating the HTTP request method.""" 391 default_method = "POST" if self.data is not None else "GET" 392 return getattr(self, 'method', default_method) 393 394 def get_full_url(self): 395 return self.full_url 396 397 def set_proxy(self, host, type): 398 if self.type == 'https' and not self._tunnel_host: 399 self._tunnel_host = self.host 400 else: 401 self.type= type 402 self.selector = self.full_url 403 self.host = host 404 405 def has_proxy(self): 406 return self.selector == self.full_url 407 408 def add_header(self, key, val): 409 # useful for something like authentication 410 self.headers[key.capitalize()] = val 411 412 def add_unredirected_header(self, key, val): 413 # will not be added to a redirected request 414 self.unredirected_hdrs[key.capitalize()] = val 415 416 def has_header(self, header_name): 417 return (header_name in self.headers or 418 header_name in self.unredirected_hdrs) 419 420 def get_header(self, header_name, default=None): 421 return self.headers.get( 422 header_name, 423 self.unredirected_hdrs.get(header_name, default)) 424 425 def remove_header(self, header_name): 426 self.headers.pop(header_name, None) 427 self.unredirected_hdrs.pop(header_name, None) 428 429 def header_items(self): 430 hdrs = self.unredirected_hdrs.copy() 431 hdrs.update(self.headers) 432 return list(hdrs.items()) 433 434 class OpenerDirector: 435 def __init__(self): 436 client_version = "Python-urllib/%s" % __version__ 437 self.addheaders = [('User-agent', client_version)] 438 # self.handlers is retained only for backward compatibility 439 self.handlers = [] 440 # manage the individual handlers 441 self.handle_open = {} 442 self.handle_error = {} 443 self.process_response = {} 444 self.process_request = {} 
    def add_handler(self, handler):
        """Register *handler*'s protocol methods in the dispatch tables.

        Methods are discovered by name: "<proto>_open",
        "<proto>_request", "<proto>_response" and
        "<proto>_error_<kind>" are indexed into handle_open,
        process_request, process_response and handle_error
        respectively.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # Split "<protocol>_<condition>" at the first underscore.
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # "http_error_404" -> kind 404 (int when numeric,
                # otherwise the raw suffix string, e.g. "default").
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                # Keep each chain sorted by handler_order (BaseHandler.__lt__).
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)
            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open *fullurl*, running request/response processors around _open()."""
        # accept a URL or a Request object
        if isinstance(fullurl, str):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.data = data

        req.timeout = timeout
        protocol = req.type

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        """Dispatch to default_open, then <proto>_open, then unknown_open."""
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.type
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the appropriate <proto>_error_* chain."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!  (the HTTP status code)
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            # No handler for this specific status code: fall back on
            # the http_error_default chain with the original arguments.
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)

# XXX probably also want an abstract factory that knows when it makes
# sense to skip a superclass in favor of a subclass and when it might
# make sense to include both

def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor,
                       DataHandler]
    if hasattr(http.client, "HTTPSConnection"):
        default_classes.append(HTTPSHandler)
    # A user-supplied handler (instance or class) that subclasses a
    # default suppresses that default.
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if isinstance(check, type):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isinstance(h, type):
            h = h()
        opener.add_handler(h)
    return opener

class BaseHandler:
    # Handlers with a lower handler_order sort (and run) earlier in
    # each chain; see OpenerDirector.add_handler and __lt__ below.
    handler_order = 500

    def add_parent(self, parent):
        # Back-reference to the owning OpenerDirector.
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility
        pass

    def __lt__(self, other):
        if not hasattr(other, "handler_order"):
            # Try to preserve the old behavior of having custom classes
            # inserted after default ones (works only for custom user
            # classes which are not aware of handler_order).
            return True
        return self.handler_order < other.handler_order
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses."""
    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if not (200 <= code < 300):
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response

class HTTPDefaultErrorHandler(BaseHandler):
    # Last-resort handler: turn any otherwise-unhandled HTTP error
    # status into an HTTPError exception.
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.full_url, code, msg, hdrs, fp)

class HTTPRedirectHandler(BaseHandler):
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST")):
            raise HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # Drop body-describing headers: the redirected request is a GET
        # with no body, so they would be stale.
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.origin_req_host,
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.

        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        if not urlparts.path and urlparts.netloc:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        # http.client.parse_headers() decodes as ISO-8859-1.  Recover the
        # original bytes and percent-encode non-ASCII bytes, and any special
        # characters such as the space.
        newurl = quote(
            newurl, encoding="iso-8859-1", safe=string.punctuation)
        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"


def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme.
    """
    scheme, r_scheme = splittype(proxy)
    if not r_scheme.startswith("/"):
        # authority
        scheme = None
        authority = proxy
    else:
        # URL
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.2.
        # and 3.3.), path is empty or starts with '/'
        end = r_scheme.find("/", 2)
        if end == -1:
            end = None
        authority = r_scheme[2:end]
    userinfo, hostport = splituser(authority)
    if userinfo is not None:
        user, password = splitpasswd(userinfo)
    else:
        user = password = None
    return scheme, user, password, hostport
class ProxyHandler(BaseHandler):
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        # Create one "<scheme>_open" method per configured proxy scheme.
        # The lambda defaults bind the per-scheme url/type at definition
        # time (avoiding the late-binding closure pitfall).
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open:
                        meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Rewrite *req* to go through *proxy*; may re-dispatch the request."""
        orig_type = req.type
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            # Host is exempted from proxying; let other handlers open it.
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user),
                                   unquote(password))
            creds = base64.b64encode(user_pass.encode()).decode("ascii")
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)

class HTTPPasswordMgr:
    """Store and look up credentials keyed by realm and URI prefix."""

    def __init__(self):
        # realm -> {tuple of reduced URIs -> (user, password)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, str):
            uri = [uri]
        if realm not in self.passwd:
            self.passwd[realm] = {}
        # Store under both the with-default-port and as-given reductions
        # so later lookups match either spelling.
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for realm/authuri, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.items():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            # Normalize to an explicit default port so "example.com" and
            # "example.com:80" compare equal.
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False
889 """ 890 if base == test: 891 return True 892 if base[0] != test[0]: 893 return False 894 common = posixpath.commonprefix((base[1], test[1])) 895 if len(common) == len(base[1]): 896 return True 897 return False 898 899 900 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): 901 902 def find_user_password(self, realm, authuri): 903 user, password = HTTPPasswordMgr.find_user_password(self, realm, 904 authuri) 905 if user is not None: 906 return user, password 907 return HTTPPasswordMgr.find_user_password(self, None, authuri) 908 909 910 class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm): 911 912 def __init__(self, *args, **kwargs): 913 self.authenticated = {} 914 super().__init__(*args, **kwargs) 915 916 def add_password(self, realm, uri, user, passwd, is_authenticated=False): 917 self.update_authenticated(uri, is_authenticated) 918 # Add a default for prior auth requests 919 if realm is not None: 920 super().add_password(None, uri, user, passwd) 921 super().add_password(realm, uri, user, passwd) 922 923 def update_authenticated(self, uri, is_authenticated=False): 924 # uri could be a single URI or a sequence 925 if isinstance(uri, str): 926 uri = [uri] 927 928 for default_port in True, False: 929 for u in uri: 930 reduced_uri = self.reduce_uri(u, default_port) 931 self.authenticated[reduced_uri] = is_authenticated 932 933 def is_authenticated(self, authuri): 934 for default_port in True, False: 935 reduced_authuri = self.reduce_uri(authuri, default_port) 936 for uri in self.authenticated: 937 if self.is_suburi(uri, reduced_authuri): 938 return self.authenticated[uri] 939 940 941 class AbstractBasicAuthHandler: 942 943 # XXX this allows for multiple auth-schemes, but will stupidly pick 944 # the last one with a realm specified. 
    # allow for double- and single-quoted realm values
    # (single quotes are a violation of the RFC, but appear in the wild)
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
                    'realm=(["\']?)([^"\']*)\\2', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        # password_mgr defaults to a plain HTTPPasswordMgr; add_password
        # is re-exported so handlers can be used as password stores.
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Parse the challenge header named *authreq* and, for a Basic
        challenge, retry the request with credentials.

        Raises ValueError for a non-Basic scheme; returns None when no
        usable challenge is present.
        """
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)

        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() != 'basic':
                raise ValueError("AbstractBasicAuthHandler does not"
                                 " support the following scheme: '%s'" %
                                 scheme)
            else:
                mo = AbstractBasicAuthHandler.rx.search(authreq)
                if mo:
                    scheme, quote, realm = mo.groups()
                    # quote is the (possibly empty) quoting character
                    # captured around the realm value.
                    if quote not in ['"',"'"]:
                        warnings.warn("Basic Auth Realm was unquoted",
                                      UserWarning, 2)
                    if scheme.lower() == 'basic':
                        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-issue *req* with an Authorization header for *realm*.

        Returns None when no password is known or when the same header
        was already sent (avoids retry loops).
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
            if req.get_header(self.auth_header, None) == auth:
                return None
            req.add_unredirected_header(self.auth_header, auth)
            return self.parent.open(req, timeout=req.timeout)
        else:
            return None

    def http_request(self, req):
        """Pre-emptively attach Basic credentials when the password
        manager records a prior successful auth for this URL."""
        if (not hasattr(self.passwd, 'is_authenticated') or
                not self.passwd.is_authenticated(req.full_url)):
            return req

        if not req.has_header('Authorization'):
            user, passwd = self.passwd.find_user_password(None, req.full_url)
            credentials = '{0}:{1}'.format(user, passwd).encode()
            auth_str = base64.standard_b64encode(credentials).decode()
            req.add_unredirected_header('Authorization',
                                        'Basic {}'.format(auth_str.strip()))
        return req

    def http_response(self, req, response):
        # Record whether this URL authenticated successfully (2xx) so
        # later requests can send credentials pre-emptively.
        if hasattr(self.passwd, 'is_authenticated'):
            if 200 <= response.code < 300:
                self.passwd.update_authenticated(req.full_url, True)
            else:
                self.passwd.update_authenticated(req.full_url, False)
        return response

    https_request = http_request
    https_response = http_response


class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 401 responses by retrying with Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.full_url
        response = self.http_error_auth_reqed('www-authenticate',
                                              url, req, headers)
        return response


class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handle 407 responses by retrying with proxy Basic credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib.request does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.host
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              authority, req, headers)
        return response


# Return n random bytes.
_randombytes = os.urandom


class AbstractDigestAuthHandler:
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.
    # XXX support for qop="auth-int" is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        # retried: failed-challenge counter, capped in
        # http_error_auth_reqed; nonce_count/last_nonce implement the
        # RFC 2617 nonce-count (nc) bookkeeping for qop="auth".
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Handle a Digest challenge found in header *auth_header*."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.full_url, 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            elif scheme.lower() != 'basic':
                # Basic is silently ignored here so a Basic handler
                # further down the chain can deal with it.
                raise ValueError("AbstractDigestAuthHandler does not support"
                                 " the following scheme: '%s'" % scheme)

    def retry_http_digest_auth(self, req, auth):
        """Re-issue *req* with an Authorization header built from the
        challenge *auth*; returns None if the header was already sent."""
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(filter(None, parse_http_list(challenge)))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime())
        b = s.encode("ascii") + _randombytes(8)
        dig = hashlib.sha1(b).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization header value (RFC 2617 s. 3.2.2)
        for *req* from the parsed challenge *chal*, or return None when
        the challenge or credentials are unusable."""
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.full_url)
        if user is None:
            return None

        # XXX not implemented yet
        if req.data is not None:
            entdig = self.get_entity_digest(req.data, chal)
        else:
            entdig = None

        # A1/A2 per RFC 2617 s. 3.2.2.2 / 3.2.2.3.
        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.selector)
        if qop == 'auth':
            # nc must increase for each reuse of the same server nonce.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.selector,
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (H, KD) hash callables for *algorithm* per RFC 2617."""
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest()
        # XXX MD5-sess
        else:
            raise ValueError("Unsupported digest authentication "
                             "algorithm %r" % algorithm)
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None


class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        host = urlparse(req.full_url)[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry


class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Handle 407 responses using Digest authentication with the proxy."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        host = req.host
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry


class AbstractHTTPHandler(BaseHandler):
    """Shared machinery for HTTPHandler and HTTPSHandler."""

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def _get_content_length(self, request):
        # Delegate to http.client's logic for sized vs. unsized bodies.
        return http.client.HTTPConnection._get_content_length(
            request.data,
            request.get_method())

    def do_request_(self, request):
        """Fill in default headers (Content-type, Content-length/
        Transfer-encoding, Host, parent addheaders) before sending."""
        host = request.host
        if not host:
            raise URLError('no host given')

        if request.data is not None:  # POST
            data = request.data
            if isinstance(data, str):
                msg = "POST data should be bytes, an iterable of bytes, " \
                      "or a file object. It cannot be of type str."
                raise TypeError(msg)
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            # Announce the body size, or fall back to chunked encoding
            # when it cannot be determined up front.
            if (not request.has_header('Content-length')
                    and not request.has_header('Transfer-encoding')):
                content_length = self._get_content_length(request)
                if content_length is not None:
                    request.add_unredirected_header(
                        'Content-length', str(content_length))
                else:
                    request.add_unredirected_header(
                        'Transfer-encoding', 'chunked')

        sel_host = host
        if request.has_proxy():
            # When proxying, the Host header names the origin server
            # extracted from the full-URL selector, not the proxy.
            scheme, sel = splittype(request.selector)
            sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req, **http_conn_args):
        """Return an HTTPResponse object for the request, using http_class.

        http_class must implement the HTTPConnection API from http.client.
        """
        host = req.host
        if not host:
            raise URLError('no host given')

        # will parse host:port
        h = http_class(host, timeout=req.timeout, **http_conn_args)
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take precedence over normal headers.
        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # TODO(jhylton): Should this be redesigned to handle
        # persistent connections?

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict((name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            try:
                h.request(req.get_method(), req.selector, req.data, headers,
                          encode_chunked=req.has_header('Transfer-encoding'))
            except OSError as err:  # timeout error
                raise URLError(err)
            r = h.getresponse()
        except:
            h.close()
            raise

        # If the server does not send us a 'Connection: close' header,
        # HTTPConnection assumes the socket should be left open.  Manually
        # mark the socket to be closed when this response object goes away.
        if h.sock:
            h.sock.close()
            h.sock = None

        r.url = req.get_full_url()
        # This line replaces the .msg attribute of the HTTPResponse
        # with .headers, because urllib clients expect the response to
        # have the reason in .msg.  It would be good to mark this
        # attribute as deprecated and get clients to use info() or
        # .headers instead.
        r.msg = r.reason
        return r


class HTTPHandler(AbstractHTTPHandler):

    def http_open(self, req):
        return self.do_open(http.client.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_

if hasattr(http.client, 'HTTPSConnection'):

    class HTTPSHandler(AbstractHTTPHandler):

        def __init__(self, debuglevel=0, context=None, check_hostname=None):
            AbstractHTTPHandler.__init__(self, debuglevel)
            self._context = context
            self._check_hostname = check_hostname

        def https_open(self, req):
            return self.do_open(http.client.HTTPSConnection, req,
                context=self._context, check_hostname=self._check_hostname)

        https_request = AbstractHTTPHandler.do_request_

    __all__.append('HTTPSHandler')

class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to outgoing requests and harvest them from
    responses, using an http.cookiejar.CookieJar."""

    def __init__(self, cookiejar=None):
        import http.cookiejar
        if cookiejar is None:
            cookiejar = http.cookiejar.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response

class UnknownHandler(BaseHandler):
    """Last-resort handler: fail for URL schemes nothing else claimed."""
    def unknown_open(self, req):
        type = req.type
        raise URLError('unknown url type: %s' % type)

def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated."""
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Strip one level of surrounding double quotes from the value.
        if v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed

def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.
    A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    res = []
    part = ''

    # Single pass with two flags: inside a quoted-string, and right
    # after a backslash escape.
    escape = quote = False
    for cur in s:
        if escape:
            part += cur
            escape = False
            continue
        if quote:
            if cur == '\\':
                escape = True
                continue
            elif cur == '"':
                quote = False
            part += cur
            continue

        if cur == ',':
            res.append(part)
            part = ''
            continue

        if cur == '"':
            quote = True

        part += cur

    # append last part
    if part:
        res.append(part)

    return [part.strip() for part in res]

class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open a file:// URL; only local-host URLs are allowed."""
        url = req.selector
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            if not req.host in self.get_names():
                raise URLError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        # Lazily resolve and cache (on the class) all IP addresses that
        # count as "this host".
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for a local file, with Content-type,
        Content-length and Last-modified headers synthesized from the
        file's stat info."""
        import email.utils
        import mimetypes
        host = req.host
        filename = req.selector
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = email.message_from_string(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified))
            if host:
                host, port = splitport(host)
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as exp:
            # users shouldn't expect OSErrors coming from urlopen()
            raise URLError(exp)
        raise URLError('file not on local host')

def _safe_gethostbyname(host):
    # Like socket.gethostbyname but returns None instead of raising
    # when the name cannot be resolved.
    try:
        return socket.gethostbyname(host)
    except socket.gaierror:
        return None

class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        """Open an ftp:// URL; returns an addinfourl wrapping the
        retrieved file and synthesized headers."""
        import ftplib
        import mimetypes
        host = req.host
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except OSError as msg:
            raise URLError(msg)
        path, attrs = splitattr(req.selector)
        dirs = path.split('/')
        dirs = list(map(unquote, dirs))
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Default transfer type: binary ('I') for files, directory
            # listing ('D') otherwise; a ;type= attribute overrides it.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.full_url)[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.full_url)
        except ftplib.all_errors as exp:
            exc = URLError('ftp error: %r' % exp)
            raise exc.with_traceback(sys.exc_info()[2])

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        return ftpwrapper(user, passwd, host, port, dirs, timeout,
                          persistent=False)

class CacheFTPHandler(FTPHandler):
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}      # key -> live ftpwrapper connection
        self.timeout = {}    # key -> absolute expiry time
        self.soonest = 0     # earliest expiry among cached connections
        self.delay = 60      # seconds a connection stays cached
        self.max_conns = 16  # hard cap on cached connections

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # Reuse a cached connection when available, refreshing its
        # expiry; otherwise open a new one and cache it.
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port,
                                         dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            self.soonest = min(list(self.timeout.values()))

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(list(self.timeout.values()))

    def clear_cache(self):
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()

class DataHandler(BaseHandler):
    def data_open(self, req):
        # data URLs as specified in RFC 2397.
        #
        # ignores POSTed data
        #
        # syntax:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        url = req.full_url

        scheme, data = url.split(":",1)
        mediatype, data = data.split(",",1)

        # even base64 encoded data URLs might be quoted so unquote in any case:
        data = unquote_to_bytes(data)
        if mediatype.endswith(";base64"):
            data = base64.decodebytes(data)
            mediatype = mediatype[:-7]

        if not mediatype:
            mediatype = "text/plain;charset=US-ASCII"

        headers = email.message_from_string("Content-type: %s\nContent-length: %d\n" %
            (mediatype, len(data)))

        return addinfourl(io.BytesIO(data), headers, url)


# Code moved from the old urllib module

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)


# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


ftpcache = {}

class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        # NOTE: this whole class is deprecated; constructing it warns.
        msg = "%(class)s style of invoking requests is deprecated. " \
              "Use newer urlopen functions/methods" % {'class': self.__class__.__name__}
        warnings.warn(msg, DeprecationWarning, stacklevel=3)
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version), ('Accept', '*/*')]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(to_bytes(fullurl))
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        # Dispatch to an open_<scheme> method when one exists.
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except (HTTPError, URLError):
            raise
        except OSError as msg:
            raise OSError('socket error', msg).with_traceback(sys.exc_info()[2])

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise OSError('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise OSError('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(to_bytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        # A local file needs no copy: return its own path directly.
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(splithost(url1)[1]), hdrs
            except OSError as msg:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No target given: download into a temp file whose
                # suffix mirrors the URL path's extension.
                import tempfile
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result
    # Each method named open_<type> knows how to open that type of URL

    def _open_generic_http(self, connection_factory, url, data):
        """Make an HTTP connection using connection_class.

        This is an internal method that should be called from
        open_http() or open_https().

        Arguments:
        - connection_factory should take a host name and return an
          HTTPConnection instance.
        - url is the url to retrieval or a host, relative-path pair.
        - data is payload for a POST request or None.
        """

        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            # Direct request: split out any user:password@ userinfo.
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            # Proxied request: url is a (proxyhost, full-url) pair set
            # up by open().
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

        if not host: raise OSError('http error', 'no host given')

        if proxy_passwd:
            proxy_passwd = unquote(proxy_passwd)
            proxy_auth = base64.b64encode(proxy_passwd.encode()).decode('ascii')
        else:
            proxy_auth = None

        if user_passwd:
            user_passwd = unquote(user_passwd)
            auth = base64.b64encode(user_passwd.encode()).decode('ascii')
        else:
            auth = None
        http_conn = connection_factory(host)
        headers = {}
        if proxy_auth:
            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
        if auth:
            headers["Authorization"] = "Basic %s" % auth
        if realhost:
            headers["Host"] = realhost

        # Add Connection:close as we don't support persistent connections yet.
        # This helps in closing the socket and avoiding ResourceWarning

        headers["Connection"] = "close"

        for header, value in self.addheaders:
            headers[header] = value

        if data is not None:
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            http_conn.request("POST", selector, data, headers)
        else:
            http_conn.request("GET", selector, headers=headers)

        try:
            response = http_conn.getresponse()
        except http.client.BadStatusLine:
            # something went wrong with the HTTP status line
            raise URLError("http protocol error: bad status line")

        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if 200 <= response.status < 300:
            return addinfourl(response, response.msg, "http:" + url,
                              response.status)
        else:
            return self.http_error(
                url, response.fp,
                response.status, response.reason, response.msg, data)

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        return self._open_generic_http(http.client.HTTPConnection, url, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise OSError."""
        fp.close()
        raise HTTPError(url, errcode, errmsg, headers, None)

    if _have_ssl:
        def _https_connection(self, host):
            # Factory passed to _open_generic_http for https URLs.
            return http.client.HTTPSConnection(host,
                                           key_file=self.key_file,
                                           cert_file=self.cert_file)

        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            return self._open_generic_http(self._https_connection, url, data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise URLError('file error: proxy support for file protocol currently not implemented')
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            raise ValueError("file:// scheme is supported only on localhost")
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import email.utils
        import mimetypes
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError as e:
            raise URLError(e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        host, port = splitport(host)
        # A hostname is acceptable only when it resolves to this host
        # and carries no explicit port.
        if (not port
           and socket.gethostbyname(host) in ((localhost(),) + thishost())):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            elif file[:2] == './':
                raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
            return addinfourl(open(localname, 'rb'), headers, urlfile)
        raise URLError('local file error: not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        if not isinstance(url, str):
            raise URLError('ftp error: proxy support for ftp protocol currently not implemented')
        import mimetypes
        host, path = splithost(url)
        if not host: raise URLError('ftp error: no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        # Cache key identifies one logical FTP session.
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily: close every cached
            # connection except the one we are about to use.
            for k in list(self.ftpcache):
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if key not in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            # Default transfer type: directory listing if no file part,
            # binary (image) otherwise; a ';type=X' attribute overrides.
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors() as exp:
            raise URLError('ftp error %r' % exp).with_traceback(sys.exc_info()[2])

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise URLError('data error: proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise OSError('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        # A trailing ';token' with no '=' is the encoding (e.g. ';base64');
        # anything with '=' is an ordinary mediatype parameter.
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        # Synthesize an RFC-822-style message so the payload can be served
        # through the same addinfourl interface as the network protocols.
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            # XXX is this encoding/decoding ok?
            data = base64.decodebytes(data.encode('ascii')).decode('latin-1')
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        headers = email.message_from_string(msg)
        f = io.StringIO(msg)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)


class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # auth_cache: maps 'realm@host' -> (user, passwd) — see get_user_passwd.
        self.auth_cache = {}
        # Redirect-loop guard used by http_error_302.
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception.

        Unlike URLopener.http_error_default, returns the error page itself."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        try:
            # Bail out with a synthetic 500 once maxtries redirects have
            # chained; prevents infinite redirect recursion.
            if self.maxtries and self.tries >= self.maxtries:
                if hasattr(self, "http_error_500"):
                    meth = self.http_error_500
                else:
                    meth = self.http_error_default
                return meth(url, fp, 500,
                            "Internal Server Error: Redirect Recursion",
                            headers)
            result = self.redirect_internal(url, fp, errcode, errmsg,
                                            headers, data)
            return result
        finally:
            self.tries = 0

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        # Prefer the standard Location header; fall back to the legacy URI.
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        fp.close()

        # In case the server sent a relative URL, join with original:
        newurl = urljoin(self.type + ":" + url, newurl)

        urlparts = urlparse(newurl)

        # For security reasons, we don't allow redirection to anything other than
http, https and ftp. 2162 2163 # We are using newer HTTPError with older redirect_internal method 2164 # This older method will get deprecated in 3.3 2165 2166 if urlparts.scheme not in ('http', 'https', 'ftp', ''): 2167 raise HTTPError(newurl, errcode, 2168 errmsg + 2169 " Redirection to url '%s' is not allowed." % newurl, 2170 headers, fp) 2171 2172 return self.open(newurl) 2173 2174 def http_error_301(self, url, fp, errcode, errmsg, headers, data=None): 2175 """Error 301 -- also relocated (permanently).""" 2176 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 2177 2178 def http_error_303(self, url, fp, errcode, errmsg, headers, data=None): 2179 """Error 303 -- also relocated (essentially identical to 302).""" 2180 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 2181 2182 def http_error_307(self, url, fp, errcode, errmsg, headers, data=None): 2183 """Error 307 -- relocated, but turn POST into error.""" 2184 if data is None: 2185 return self.http_error_302(url, fp, errcode, errmsg, headers, data) 2186 else: 2187 return self.http_error_default(url, fp, errcode, errmsg, headers) 2188 2189 def http_error_401(self, url, fp, errcode, errmsg, headers, data=None, 2190 retry=False): 2191 """Error 401 -- authentication required. 
        This function supports Basic authentication only."""
        # Each guard below falls back to the base-class handler (which
        # raises) when the challenge cannot be satisfied: missing header,
        # unparseable challenge, non-Basic scheme, or retry not requested.
        if 'www-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        # Dispatch to retry_http_basic_auth / retry_https_basic_auth
        # depending on the scheme of the current request (self.type).
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None,
            retry=False):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Mirrors http_error_401, but keyed on the Proxy-Authenticate
        # challenge header and the retry_proxy_* helpers.
        if 'proxy-authenticate' not in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        if not retry:
            URLopener.http_error_default(self, url, fp, errcode, errmsg,
                                         headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        # Embed user:password credentials into the proxy URL and retry.
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Strip any credentials already present in the proxy host; i > 0
        # also tells get_user_passwd to drop its cached entry.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        # Same as retry_proxy_http_basic_auth, for the https proxy entry.
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = "%s:%s@%s" % (quote(user, safe=''),
                                  quote(passwd, safe=''), proxyhost)
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        # Embed credentials directly into the request URL and retry.
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
                             quote(passwd, safe=''), host)
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        # Same as retry_http_basic_auth, for https URLs.
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = "%s:%s@%s" % (quote(user, safe=''),
quote(passwd, safe=''), host) 2297 newurl = 'https://' + host + selector 2298 if data is None: 2299 return self.open(newurl) 2300 else: 2301 return self.open(newurl, data) 2302 2303 def get_user_passwd(self, host, realm, clear_cache=0): 2304 key = realm + '@' + host.lower() 2305 if key in self.auth_cache: 2306 if clear_cache: 2307 del self.auth_cache[key] 2308 else: 2309 return self.auth_cache[key] 2310 user, passwd = self.prompt_user_passwd(host, realm) 2311 if user or passwd: self.auth_cache[key] = (user, passwd) 2312 return user, passwd 2313 2314 def prompt_user_passwd(self, host, realm): 2315 """Override this in a GUI environment!""" 2316 import getpass 2317 try: 2318 user = input("Enter username for %s at %s: " % (realm, host)) 2319 passwd = getpass.getpass("Enter password for %s in %s at %s: " % 2320 (user, realm, host)) 2321 return user, passwd 2322 except KeyboardInterrupt: 2323 print() 2324 return None, None 2325 2326 2327 # Utility functions 2328 2329 _localhost = None 2330 def localhost(): 2331 """Return the IP address of the magic hostname 'localhost'.""" 2332 global _localhost 2333 if _localhost is None: 2334 _localhost = socket.gethostbyname('localhost') 2335 return _localhost 2336 2337 _thishost = None 2338 def thishost(): 2339 """Return the IP addresses of the current host.""" 2340 global _thishost 2341 if _thishost is None: 2342 try: 2343 _thishost = tuple(socket.gethostbyname_ex(socket.gethostname())[2]) 2344 except socket.gaierror: 2345 _thishost = tuple(socket.gethostbyname_ex('localhost')[2]) 2346 return _thishost 2347 2348 _ftperrors = None 2349 def ftperrors(): 2350 """Return the set of errors raised by the FTP class.""" 2351 global _ftperrors 2352 if _ftperrors is None: 2353 import ftplib 2354 _ftperrors = ftplib.all_errors 2355 return _ftperrors 2356 2357 _noheaders = None 2358 def noheaders(): 2359 """Return an empty email Message object.""" 2360 global _noheaders 2361 if _noheaders is None: 2362 _noheaders = email.message_from_string("") 
2363 return _noheaders 2364 2365 2366 # Utility classes 2367 2368 class ftpwrapper: 2369 """Class used by open_ftp() for cache of open FTP connections.""" 2370 2371 def __init__(self, user, passwd, host, port, dirs, timeout=None, 2372 persistent=True): 2373 self.user = user 2374 self.passwd = passwd 2375 self.host = host 2376 self.port = port 2377 self.dirs = dirs 2378 self.timeout = timeout 2379 self.refcount = 0 2380 self.keepalive = persistent 2381 try: 2382 self.init() 2383 except: 2384 self.close() 2385 raise 2386 2387 def init(self): 2388 import ftplib 2389 self.busy = 0 2390 self.ftp = ftplib.FTP() 2391 self.ftp.connect(self.host, self.port, self.timeout) 2392 self.ftp.login(self.user, self.passwd) 2393 _target = '/'.join(self.dirs) 2394 self.ftp.cwd(_target) 2395 2396 def retrfile(self, file, type): 2397 import ftplib 2398 self.endtransfer() 2399 if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 2400 else: cmd = 'TYPE ' + type; isdir = 0 2401 try: 2402 self.ftp.voidcmd(cmd) 2403 except ftplib.all_errors: 2404 self.init() 2405 self.ftp.voidcmd(cmd) 2406 conn = None 2407 if file and not isdir: 2408 # Try to retrieve as a file 2409 try: 2410 cmd = 'RETR ' + file 2411 conn, retrlen = self.ftp.ntransfercmd(cmd) 2412 except ftplib.error_perm as reason: 2413 if str(reason)[:3] != '550': 2414 raise URLError('ftp error: %r' % reason).with_traceback( 2415 sys.exc_info()[2]) 2416 if not conn: 2417 # Set transfer mode to ASCII! 2418 self.ftp.voidcmd('TYPE A') 2419 # Try a directory listing. Verify that directory exists. 
2420 if file: 2421 pwd = self.ftp.pwd() 2422 try: 2423 try: 2424 self.ftp.cwd(file) 2425 except ftplib.error_perm as reason: 2426 raise URLError('ftp error: %r' % reason) from reason 2427 finally: 2428 self.ftp.cwd(pwd) 2429 cmd = 'LIST ' + file 2430 else: 2431 cmd = 'LIST' 2432 conn, retrlen = self.ftp.ntransfercmd(cmd) 2433 self.busy = 1 2434 2435 ftpobj = addclosehook(conn.makefile('rb'), self.file_close) 2436 self.refcount += 1 2437 conn.close() 2438 # Pass back both a suitably decorated object and a retrieval length 2439 return (ftpobj, retrlen) 2440 2441 def endtransfer(self): 2442 self.busy = 0 2443 2444 def close(self): 2445 self.keepalive = False 2446 if self.refcount <= 0: 2447 self.real_close() 2448 2449 def file_close(self): 2450 self.endtransfer() 2451 self.refcount -= 1 2452 if self.refcount <= 0 and not self.keepalive: 2453 self.real_close() 2454 2455 def real_close(self): 2456 self.endtransfer() 2457 try: 2458 self.ftp.close() 2459 except ftperrors(): 2460 pass 2461 2462 # Proxy handling 2463 def getproxies_environment(): 2464 """Return a dictionary of scheme -> proxy server URL mappings. 2465 2466 Scan the environment for variables named <scheme>_proxy; 2467 this seems to be the standard convention. If you need a 2468 different way, you can pass a proxies dictionary to the 2469 [Fancy]URLopener constructor. 
2470 2471 """ 2472 proxies = {} 2473 # in order to prefer lowercase variables, process environment in 2474 # two passes: first matches any, second pass matches lowercase only 2475 for name, value in os.environ.items(): 2476 name = name.lower() 2477 if value and name[-6:] == '_proxy': 2478 proxies[name[:-6]] = value 2479 # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY 2480 # (non-all-lowercase) as it may be set from the web server by a "Proxy:" 2481 # header from the client 2482 # If "proxy" is lowercase, it will still be used thanks to the next block 2483 if 'REQUEST_METHOD' in os.environ: 2484 proxies.pop('http', None) 2485 for name, value in os.environ.items(): 2486 if name[-6:] == '_proxy': 2487 name = name.lower() 2488 if value: 2489 proxies[name[:-6]] = value 2490 else: 2491 proxies.pop(name[:-6], None) 2492 return proxies 2493 2494 def proxy_bypass_environment(host, proxies=None): 2495 """Test if proxies should not be used for a particular host. 2496 2497 Checks the proxy dict for the value of no_proxy, which should 2498 be a list of comma separated DNS suffixes, or '*' for all hosts. 

    """
    if proxies is None:
        proxies = getproxies_environment()
    # don't bypass, if no_proxy isn't specified
    try:
        no_proxy = proxies['no']
    except KeyError:
        return 0
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # check if the host ends with any of the DNS suffixes
    no_proxy_list = [proxy.strip() for proxy in no_proxy.split(',')]
    for name in no_proxy_list:
        if name:
            name = name.lstrip('.')  # ignore leading dots
            name = re.escape(name)
            # Match the suffix itself or any subdomain of it; tested against
            # both the bare host and the host:port form.
            pattern = r'(.+\.)?%s$' % name
            if (re.match(pattern, hostonly, re.I)
                    or re.match(pattern, host, re.I)):
                return 1
    # otherwise, don't bypass
    return 0


# This code tests an OSX specific data structure but is testable on all
# platforms
def _proxy_bypass_macosx_sysconf(host, proxy_settings):
    """
    Return True iff this host shouldn't be accessed using a proxy

    This function uses the MacOSX framework SystemConfiguration
    to fetch the proxy information.

    proxy_settings come from _scproxy._get_proxy_settings or get mocked ie:
    { 'exclude_simple': bool,
      'exceptions': ['foo.bar', '*.bar.com', '127.0.0.1', '10.1', '10.0/16']
    }
    """
    from fnmatch import fnmatch

    hostonly, port = splitport(host)

    def ip2num(ipAddr):
        # Pack a (possibly truncated) dotted-quad string into a 32-bit int;
        # missing trailing octets are padded with zeros.
        parts = ipAddr.split('.')
        parts = list(map(int, parts))
        if len(parts) != 4:
            parts = (parts + [0, 0, 0, 0])[:4]
        return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

    # Check for simple host names:
    if '.' not in host:
        if proxy_settings['exclude_simple']:
            return True

    hostIP = None

    for value in proxy_settings.get('exceptions', ()):
        # Items in the list are strings like these: *.local, 169.254/16
        if not value: continue

        m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
        if m is not None:
            # Numeric entry, optionally with a /prefix-length mask.
            if hostIP is None:
                # Resolve the target host lazily, only when a numeric
                # exception is actually present.
                try:
                    hostIP = socket.gethostbyname(hostonly)
                    hostIP = ip2num(hostIP)
                except OSError:
                    continue

            base = ip2num(m.group(1))
            mask = m.group(2)
            if mask is None:
                # No explicit mask: one full byte per dotted component.
                mask = 8 * (m.group(1).count('.') + 1)
            else:
                mask = int(mask[1:])
            # Convert prefix length to a right-shift count.
            mask = 32 - mask

            if (hostIP >> mask) == (base >> mask):
                return True

        elif fnmatch(host, value):
            # Non-numeric entry: shell-style wildcard match on the hostname.
            return True

    return False


if sys.platform == 'darwin':
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        proxy_settings = _get_proxy_settings()
        return _proxy_bypass_macosx_sysconf(host, proxy_settings)

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()



    def proxy_bypass(host):
        """Return True, if host should be bypassed.

        Checks proxy settings gathered from the environment, if specified,
        or from the MacOSX framework SystemConfiguration.

        """
        proxies = getproxies_environment()
        if proxies:
            return proxy_bypass_environment(host, proxies)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        return getproxies_environment() or getproxies_macosx_sysconf()


elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.
2626 2627 Win32 uses the registry to store proxies. 2628 2629 """ 2630 proxies = {} 2631 try: 2632 import winreg 2633 except ImportError: 2634 # Std module, so should be around - but you never know! 2635 return proxies 2636 try: 2637 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER, 2638 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') 2639 proxyEnable = winreg.QueryValueEx(internetSettings, 2640 'ProxyEnable')[0] 2641 if proxyEnable: 2642 # Returned as Unicode but problems if not converted to ASCII 2643 proxyServer = str(winreg.QueryValueEx(internetSettings, 2644 'ProxyServer')[0]) 2645 if '=' in proxyServer: 2646 # Per-protocol settings 2647 for p in proxyServer.split(';'): 2648 protocol, address = p.split('=', 1) 2649 # See if address has a type:// prefix 2650 if not re.match('^([^/:]+)://', address): 2651 address = '%s://%s' % (protocol, address) 2652 proxies[protocol] = address 2653 else: 2654 # Use one setting for all protocols 2655 if proxyServer[:5] == 'http:': 2656 proxies['http'] = proxyServer 2657 else: 2658 proxies['http'] = 'http://%s' % proxyServer 2659 proxies['https'] = 'https://%s' % proxyServer 2660 proxies['ftp'] = 'ftp://%s' % proxyServer 2661 internetSettings.Close() 2662 except (OSError, ValueError, TypeError): 2663 # Either registry key not found etc, or the value in an 2664 # unexpected format. 2665 # proxies already set up to be empty so nothing to do 2666 pass 2667 return proxies 2668 2669 def getproxies(): 2670 """Return a dictionary of scheme -> proxy server URL mappings. 2671 2672 Returns settings gathered from the environment, if specified, 2673 or the registry. 2674 2675 """ 2676 return getproxies_environment() or getproxies_registry() 2677 2678 def proxy_bypass_registry(host): 2679 try: 2680 import winreg 2681 except ImportError: 2682 # Std modules, so should be around - but you never know! 
2683 return 0 2684 try: 2685 internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER, 2686 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') 2687 proxyEnable = winreg.QueryValueEx(internetSettings, 2688 'ProxyEnable')[0] 2689 proxyOverride = str(winreg.QueryValueEx(internetSettings, 2690 'ProxyOverride')[0]) 2691 # ^^^^ Returned as Unicode but problems if not converted to ASCII 2692 except OSError: 2693 return 0 2694 if not proxyEnable or not proxyOverride: 2695 return 0 2696 # try to make a host list from name and IP address. 2697 rawHost, port = splitport(host) 2698 host = [rawHost] 2699 try: 2700 addr = socket.gethostbyname(rawHost) 2701 if addr != rawHost: 2702 host.append(addr) 2703 except OSError: 2704 pass 2705 try: 2706 fqdn = socket.getfqdn(rawHost) 2707 if fqdn != rawHost: 2708 host.append(fqdn) 2709 except OSError: 2710 pass 2711 # make a check value list from the registry entry: replace the 2712 # '<local>' string by the localhost entry and the corresponding 2713 # canonical entry. 2714 proxyOverride = proxyOverride.split(';') 2715 # now check if we match one of the registry values. 2716 for test in proxyOverride: 2717 if test == '<local>': 2718 if '.' not in rawHost: 2719 return 1 2720 test = test.replace(".", r"\.") # mask dots 2721 test = test.replace("*", r".*") # change glob sequence 2722 test = test.replace("?", r".") # change glob char 2723 for val in host: 2724 if re.match(test, val, re.I): 2725 return 1 2726 return 0 2727 2728 def proxy_bypass(host): 2729 """Return True, if host should be bypassed. 2730 2731 Checks proxy settings gathered from the environment, if specified, 2732 or the registry. 2733 2734 """ 2735 proxies = getproxies_environment() 2736 if proxies: 2737 return proxy_bypass_environment(host, proxies) 2738 else: 2739 return proxy_bypass_registry(host) 2740 2741 else: 2742 # By default use environment variables 2743 getproxies = getproxies_environment 2744 proxy_bypass = proxy_bypass_environment 2745