Home | History | Annotate | Download | only in urllib
      1 """Parse (absolute and relative) URLs.
      2 
      3 urlparse module is based upon the following RFC specifications.
      4 
      5 RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
      6 and L.  Masinter, January 2005.
      7 
      8 RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
      9 and L.Masinter, December 1999.
     10 
     11 RFC 2396:  "Uniform Resource Identifiers (URI)": Generic Syntax by T.
     12 Berners-Lee, R. Fielding, and L. Masinter, August 1998.
     13 
     14 RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
     15 
     16 RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
     17 1995.
     18 
     19 RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
     20 McCahill, December 1994
     21 
     22 RFC 3986 is considered the current standard and any future changes to
     23 urlparse module should conform with it.  The urlparse module is
     24 currently not entirely compliant with this RFC due to de facto
     25 scenarios for parsing, and for backward compatibility purposes, some
     26 parsing quirks from older RFCs are retained. The testcases in
     27 test_urlparse.py provide a good indicator of parsing behavior.
     28 """
     29 
     30 import re
     31 import sys
     32 import collections
     33 
# Names exported by a star import of this module, i.e. its public API:
# the parsing/joining functions, the quoting helpers, and the str/bytes
# result classes.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
     40 
     41 # A classification of schemes ('' means apply by default)
     42 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
     43                  'wais', 'file', 'https', 'shttp', 'mms',
     44                  'prospero', 'rtsp', 'rtspu', '', 'sftp',
     45                  'svn', 'svn+ssh', 'ws', 'wss']
     46 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
     47                'imap', 'wais', 'file', 'mms', 'https', 'shttp',
     48                'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
     49                'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
     50                'ws', 'wss']
     51 uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
     52                'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
     53                'mms', '', 'sftp', 'tel']
     54 
     55 # These are not actually used anymore, but should stay for backwards
     56 # compatibility.  (They are undocumented, but have a public-looking name.)
     57 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
     58                     'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
     59 uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
     60               'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
     61 uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
     62                  'nntp', 'wais', 'https', 'shttp', 'snews',
     63                  'file', 'prospero', '']
     64 
     65 # Characters valid in scheme names
     66 scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
     67                 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
     68                 '0123456789'
     69                 '+-.')
     70 
     71 # XXX: Consider replacing with functools.lru_cache
     72 MAX_CACHE_SIZE = 20
     73 _parse_cache = {}
     74 
     75 def clear_cache():
     76     """Clear the parse cache and the quoters cache."""
     77     _parse_cache.clear()
     78     _safe_quoters.clear()
     79 
     80 
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
#
# These two values are the default codec and error handler used by
# _encode_result() and _decode_args() when bytes input is coerced to str
# and results are converted back to bytes.
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
     89 
     90 def _noop(obj):
     91     return obj
     92 
     93 def _encode_result(obj, encoding=_implicit_encoding,
     94                         errors=_implicit_errors):
     95     return obj.encode(encoding, errors)
     96 
     97 def _decode_args(args, encoding=_implicit_encoding,
     98                        errors=_implicit_errors):
     99     return tuple(x.decode(encoding, errors) if x else '' for x in args)
    100 
    101 def _coerce_args(*args):
    102     # Invokes decode if necessary to create str args
    103     # and returns the coerced inputs along with
    104     # an appropriate result coercion function
    105     #   - noop for str inputs
    106     #   - encoding function otherwise
    107     str_input = isinstance(args[0], str)
    108     for arg in args[1:]:
    109         # We special-case the empty string to support the
    110         # "scheme=''" default argument to some functions
    111         if arg and isinstance(arg, str) != str_input:
    112             raise TypeError("Cannot mix str and non-str arguments")
    113     if str_input:
    114         return args + (_noop,)
    115     return _decode_args(args) + (_encode_result,)
    116 
    117 # Result objects are more helpful than simple tuples
    118 class _ResultMixinStr(object):
    119     """Standard approach to encoding parsed results from str to bytes"""
    120     __slots__ = ()
    121 
    122     def encode(self, encoding='ascii', errors='strict'):
    123         return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))
    124 
    125 
    126 class _ResultMixinBytes(object):
    127     """Standard approach to decoding parsed results from bytes to str"""
    128     __slots__ = ()
    129 
    130     def decode(self, encoding='ascii', errors='strict'):
    131         return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))
    132 
    133 
    134 class _NetlocResultMixinBase(object):
    135     """Shared methods for the parsed result objects containing a netloc element"""
    136     __slots__ = ()
    137 
    138     @property
    139     def username(self):
    140         return self._userinfo[0]
    141 
    142     @property
    143     def password(self):
    144         return self._userinfo[1]
    145 
    146     @property
    147     def hostname(self):
    148         hostname = self._hostinfo[0]
    149         if not hostname:
    150             hostname = None
    151         elif hostname is not None:
    152             hostname = hostname.lower()
    153         return hostname
    154 
    155     @property
    156     def port(self):
    157         port = self._hostinfo[1]
    158         if port is not None:
    159             port = int(port, 10)
    160             if not ( 0 <= port <= 65535):
    161                 raise ValueError("Port out of range 0-65535")
    162         return port
    163 
    164 
    165 class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    166     __slots__ = ()
    167 
    168     @property
    169     def _userinfo(self):
    170         netloc = self.netloc
    171         userinfo, have_info, hostinfo = netloc.rpartition('@')
    172         if have_info:
    173             username, have_password, password = userinfo.partition(':')
    174             if not have_password:
    175                 password = None
    176         else:
    177             username = password = None
    178         return username, password
    179 
    180     @property
    181     def _hostinfo(self):
    182         netloc = self.netloc
    183         _, _, hostinfo = netloc.rpartition('@')
    184         _, have_open_br, bracketed = hostinfo.partition('[')
    185         if have_open_br:
    186             hostname, _, port = bracketed.partition(']')
    187             _, _, port = port.partition(':')
    188         else:
    189             hostname, _, port = hostinfo.partition(':')
    190         if not port:
    191             port = None
    192         return hostname, port
    193 
    194 
    195 class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    196     __slots__ = ()
    197 
    198     @property
    199     def _userinfo(self):
    200         netloc = self.netloc
    201         userinfo, have_info, hostinfo = netloc.rpartition(b'@')
    202         if have_info:
    203             username, have_password, password = userinfo.partition(b':')
    204             if not have_password:
    205                 password = None
    206         else:
    207             username = password = None
    208         return username, password
    209 
    210     @property
    211     def _hostinfo(self):
    212         netloc = self.netloc
    213         _, _, hostinfo = netloc.rpartition(b'@')
    214         _, have_open_br, bracketed = hostinfo.partition(b'[')
    215         if have_open_br:
    216             hostname, _, port = bracketed.partition(b']')
    217             _, _, port = port.partition(b':')
    218         else:
    219             hostname, _, port = hostinfo.partition(b':')
    220         if not port:
    221             port = None
    222         return hostname, port
    223 
    224 
    225 from collections import namedtuple
    226 
# Plain namedtuple bases.  The public result classes subclass these and mix
# in the encode/decode and netloc helpers.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

# Docstrings for the tuple types and for each of their field properties.
_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params,  query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

# The fields ParseResult shares with SplitResult reuse the same docstrings;
# only "params" needs its own text.
_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__


# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr
    298 
    299 # Structured result objects for string data
    300 class DefragResult(_DefragResultBase, _ResultMixinStr):
    301     __slots__ = ()
    302     def geturl(self):
    303         if self.fragment:
    304             return self.url + '#' + self.fragment
    305         else:
    306             return self.url
    307 
    308 class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    309     __slots__ = ()
    310     def geturl(self):
    311         return urlunsplit(self)
    312 
    313 class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    314     __slots__ = ()
    315     def geturl(self):
    316         return urlunparse(self)
    317 
    318 # Structured result objects for bytes data
    319 class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    320     __slots__ = ()
    321     def geturl(self):
    322         if self.fragment:
    323             return self.url + b'#' + self.fragment
    324         else:
    325             return self.url
    326 
    327 class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    328     __slots__ = ()
    329     def geturl(self):
    330         return urlunsplit(self)
    331 
    332 class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    333     __slots__ = ()
    334     def geturl(self):
    335         return urlunparse(self)
    336 
    337 # Set up the encode/decode result pairs
    338 def _fix_result_transcoding():
    339     _result_pairs = (
    340         (DefragResult, DefragResultBytes),
    341         (SplitResult, SplitResultBytes),
    342         (ParseResult, ParseResultBytes),
    343     )
    344     for _decoded, _encoded in _result_pairs:
    345         _decoded._encoded_counterpart = _encoded
    346         _encoded._decoded_counterpart = _decoded
    347 
    348 _fix_result_transcoding()
    349 del _fix_result_transcoding
    350 
    351 def urlparse(url, scheme='', allow_fragments=True):
    352     """Parse a URL into 6 components:
    353     <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    354     Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    355     Note that we don't break the components up in smaller bits
    356     (e.g. netloc is a single string) and we don't expand % escapes."""
    357     url, scheme, _coerce_result = _coerce_args(url, scheme)
    358     splitresult = urlsplit(url, scheme, allow_fragments)
    359     scheme, netloc, url, query, fragment = splitresult
    360     if scheme in uses_params and ';' in url:
    361         url, params = _splitparams(url)
    362     else:
    363         params = ''
    364     result = ParseResult(scheme, netloc, url, params, query, fragment)
    365     return _coerce_result(result)
    366 
    367 def _splitparams(url):
    368     if '/'  in url:
    369         i = url.find(';', url.rfind('/'))
    370         if i < 0:
    371             return url, ''
    372     else:
    373         i = url.find(';')
    374     return url[:i], url[i+1:]
    375 
    376 def _splitnetloc(url, start=0):
    377     delim = len(url)   # position of end of domain part of url, default is end
    378     for c in '/?#':    # look for delimiters; the order is NOT important
    379         wdelim = url.find(c, start)        # find first of this delim
    380         if wdelim >= 0:                    # if found
    381             delim = min(delim, wdelim)     # use earliest delim position
    382     return url[start:delim], url[delim:]   # return (domain, rest)
    383 
    384 def urlsplit(url, scheme='', allow_fragments=True):
    385     """Parse a URL into 5 components:
    386     <scheme>://<netloc>/<path>?<query>#<fragment>
    387     Return a 5-tuple: (scheme, netloc, path, query, fragment).
    388     Note that we don't break the components up in smaller bits
    389     (e.g. netloc is a single string) and we don't expand % escapes."""
    390     url, scheme, _coerce_result = _coerce_args(url, scheme)
    391     allow_fragments = bool(allow_fragments)
    392     key = url, scheme, allow_fragments, type(url), type(scheme)
    393     cached = _parse_cache.get(key, None)
    394     if cached:
    395         return _coerce_result(cached)
    396     if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
    397         clear_cache()
    398     netloc = query = fragment = ''
    399     i = url.find(':')
    400     if i > 0:
    401         if url[:i] == 'http': # optimize the common case
    402             scheme = url[:i].lower()
    403             url = url[i+1:]
    404             if url[:2] == '//':
    405                 netloc, url = _splitnetloc(url, 2)
    406                 if (('[' in netloc and ']' not in netloc) or
    407                         (']' in netloc and '[' not in netloc)):
    408                     raise ValueError("Invalid IPv6 URL")
    409             if allow_fragments and '#' in url:
    410                 url, fragment = url.split('#', 1)
    411             if '?' in url:
    412                 url, query = url.split('?', 1)
    413             v = SplitResult(scheme, netloc, url, query, fragment)
    414             _parse_cache[key] = v
    415             return _coerce_result(v)
    416         for c in url[:i]:
    417             if c not in scheme_chars:
    418                 break
    419         else:
    420             # make sure "url" is not actually a port number (in which case
    421             # "scheme" is really part of the path)
    422             rest = url[i+1:]
    423             if not rest or any(c not in '0123456789' for c in rest):
    424                 # not a port number
    425                 scheme, url = url[:i].lower(), rest
    426 
    427     if url[:2] == '//':
    428         netloc, url = _splitnetloc(url, 2)
    429         if (('[' in netloc and ']' not in netloc) or
    430                 (']' in netloc and '[' not in netloc)):
    431             raise ValueError("Invalid IPv6 URL")
    432     if allow_fragments and '#' in url:
    433         url, fragment = url.split('#', 1)
    434     if '?' in url:
    435         url, query = url.split('?', 1)
    436     v = SplitResult(scheme, netloc, url, query, fragment)
    437     _parse_cache[key] = v
    438     return _coerce_result(v)
    439 
    440 def urlunparse(components):
    441     """Put a parsed URL back together again.  This may result in a
    442     slightly different, but equivalent URL, if the URL that was parsed
    443     originally had redundant delimiters, e.g. a ? with an empty query
    444     (the draft states that these are equivalent)."""
    445     scheme, netloc, url, params, query, fragment, _coerce_result = (
    446                                                   _coerce_args(*components))
    447     if params:
    448         url = "%s;%s" % (url, params)
    449     return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))
    450 
    451 def urlunsplit(components):
    452     """Combine the elements of a tuple as returned by urlsplit() into a
    453     complete URL as a string. The data argument can be any five-item iterable.
    454     This may result in a slightly different, but equivalent URL, if the URL that
    455     was parsed originally had unnecessary delimiters (for example, a ? with an
    456     empty query; the RFC states that these are equivalent)."""
    457     scheme, netloc, url, query, fragment, _coerce_result = (
    458                                           _coerce_args(*components))
    459     if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
    460         if url and url[:1] != '/': url = '/' + url
    461         url = '//' + (netloc or '') + url
    462     if scheme:
    463         url = scheme + ':' + url
    464     if query:
    465         url = url + '?' + query
    466     if fragment:
    467         url = url + '#' + fragment
    468     return _coerce_result(url)
    469 
    470 def urljoin(base, url, allow_fragments=True):
    471     """Join a base URL and a possibly relative URL to form an absolute
    472     interpretation of the latter."""
    473     if not base:
    474         return url
    475     if not url:
    476         return base
    477 
    478     base, url, _coerce_result = _coerce_args(base, url)
    479     bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
    480             urlparse(base, '', allow_fragments)
    481     scheme, netloc, path, params, query, fragment = \
    482             urlparse(url, bscheme, allow_fragments)
    483 
    484     if scheme != bscheme or scheme not in uses_relative:
    485         return _coerce_result(url)
    486     if scheme in uses_netloc:
    487         if netloc:
    488             return _coerce_result(urlunparse((scheme, netloc, path,
    489                                               params, query, fragment)))
    490         netloc = bnetloc
    491 
    492     if not path and not params:
    493         path = bpath
    494         params = bparams
    495         if not query:
    496             query = bquery
    497         return _coerce_result(urlunparse((scheme, netloc, path,
    498                                           params, query, fragment)))
    499 
    500     base_parts = bpath.split('/')
    501     if base_parts[-1] != '':
    502         # the last item is not a directory, so will not be taken into account
    503         # in resolving the relative path
    504         del base_parts[-1]
    505 
    506     # for rfc3986, ignore all base path should the first character be root.
    507     if path[:1] == '/':
    508         segments = path.split('/')
    509     else:
    510         segments = base_parts + path.split('/')
    511         # filter out elements that would cause redundant slashes on re-joining
    512         # the resolved_path
    513         segments[1:-1] = filter(None, segments[1:-1])
    514 
    515     resolved_path = []
    516 
    517     for seg in segments:
    518         if seg == '..':
    519             try:
    520                 resolved_path.pop()
    521             except IndexError:
    522                 # ignore any .. segments that would otherwise cause an IndexError
    523                 # when popped from resolved_path if resolving for rfc3986
    524                 pass
    525         elif seg == '.':
    526             continue
    527         else:
    528             resolved_path.append(seg)
    529 
    530     if segments[-1] in ('.', '..'):
    531         # do some post-processing here. if the last segment was a relative dir,
    532         # then we need to append the trailing '/'
    533         resolved_path.append('')
    534 
    535     return _coerce_result(urlunparse((scheme, netloc, '/'.join(
    536         resolved_path) or '/', params, query, fragment)))
    537 
    538 
    539 def urldefrag(url):
    540     """Removes any existing fragment from URL.
    541 
    542     Returns a tuple of the defragmented URL and the fragment.  If
    543     the URL contained no fragments, the second element is the
    544     empty string.
    545     """
    546     url, _coerce_result = _coerce_args(url)
    547     if '#' in url:
    548         s, n, p, a, q, frag = urlparse(url)
    549         defrag = urlunparse((s, n, p, a, q, ''))
    550     else:
    551         frag = ''
    552         defrag = url
    553     return _coerce_result(DefragResult(defrag, frag))
    554 
    555 _hexdig = '0123456789ABCDEFabcdef'
    556 _hextobyte = None
    557 
    558 def unquote_to_bytes(string):
    559     """unquote_to_bytes('abc%20def') -> b'abc def'."""
    560     # Note: strings are encoded as UTF-8. This is only an issue if it contains
    561     # unescaped non-ASCII characters, which URIs should not.
    562     if not string:
    563         # Is it a string-like object?
    564         string.split
    565         return b''
    566     if isinstance(string, str):
    567         string = string.encode('utf-8')
    568     bits = string.split(b'%')
    569     if len(bits) == 1:
    570         return string
    571     res = [bits[0]]
    572     append = res.append
    573     # Delay the initialization of the table to not waste memory
    574     # if the function is never called
    575     global _hextobyte
    576     if _hextobyte is None:
    577         _hextobyte = {(a + b).encode(): bytes([int(a + b, 16)])
    578                       for a in _hexdig for b in _hexdig}
    579     for item in bits[1:]:
    580         try:
    581             append(_hextobyte[item[:2]])
    582             append(item[2:])
    583         except KeyError:
    584             append(b'%')
    585             append(item)
    586     return b''.join(res)
    587 
    588 _asciire = re.compile('([\x00-\x7f]+)')
    589 
    590 def unquote(string, encoding='utf-8', errors='replace'):
    591     """Replace %xx escapes by their single-character equivalent. The optional
    592     encoding and errors parameters specify how to decode percent-encoded
    593     sequences into Unicode characters, as accepted by the bytes.decode()
    594     method.
    595     By default, percent-encoded sequences are decoded with UTF-8, and invalid
    596     sequences are replaced by a placeholder character.
    597 
    598     unquote('abc%20def') -> 'abc def'.
    599     """
    600     if '%' not in string:
    601         string.split
    602         return string
    603     if encoding is None:
    604         encoding = 'utf-8'
    605     if errors is None:
    606         errors = 'replace'
    607     bits = _asciire.split(string)
    608     res = [bits[0]]
    609     append = res.append
    610     for i in range(1, len(bits), 2):
    611         append(unquote_to_bytes(bits[i]).decode(encoding, errors))
    612         append(bits[i + 1])
    613     return ''.join(res)
    614 
    615 def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
    616              encoding='utf-8', errors='replace'):
    617     """Parse a query given as a string argument.
    618 
    619         Arguments:
    620 
    621         qs: percent-encoded query string to be parsed
    622 
    623         keep_blank_values: flag indicating whether blank values in
    624             percent-encoded queries should be treated as blank strings.
    625             A true value indicates that blanks should be retained as
    626             blank strings.  The default false value indicates that
    627             blank values are to be ignored and treated as if they were
    628             not included.
    629 
    630         strict_parsing: flag indicating what to do with parsing errors.
    631             If false (the default), errors are silently ignored.
    632             If true, errors raise a ValueError exception.
    633 
    634         encoding and errors: specify how to decode percent-encoded sequences
    635             into Unicode characters, as accepted by the bytes.decode() method.
    636     """
    637     parsed_result = {}
    638     pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
    639                       encoding=encoding, errors=errors)
    640     for name, value in pairs:
    641         if name in parsed_result:
    642             parsed_result[name].append(value)
    643         else:
    644             parsed_result[name] = [value]
    645     return parsed_result
    646 
    647 def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
    648               encoding='utf-8', errors='replace'):
    649     """Parse a query given as a string argument.
    650 
    651     Arguments:
    652 
    653     qs: percent-encoded query string to be parsed
    654 
    655     keep_blank_values: flag indicating whether blank values in
    656         percent-encoded queries should be treated as blank strings.  A
    657         true value indicates that blanks should be retained as blank
    658         strings.  The default false value indicates that blank values
    659         are to be ignored and treated as if they were  not included.
    660 
    661     strict_parsing: flag indicating what to do with parsing errors. If
    662         false (the default), errors are silently ignored. If true,
    663         errors raise a ValueError exception.
    664 
    665     encoding and errors: specify how to decode percent-encoded sequences
    666         into Unicode characters, as accepted by the bytes.decode() method.
    667 
    668     Returns a list, as G-d intended.
    669     """
    670     qs, _coerce_result = _coerce_args(qs)
    671     pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    672     r = []
    673     for name_value in pairs:
    674         if not name_value and not strict_parsing:
    675             continue
    676         nv = name_value.split('=', 1)
    677         if len(nv) != 2:
    678             if strict_parsing:
    679                 raise ValueError("bad query field: %r" % (name_value,))
    680             # Handle case of a control-name with no equal sign
    681             if keep_blank_values:
    682                 nv.append('')
    683             else:
    684                 continue
    685         if len(nv[1]) or keep_blank_values:
    686             name = nv[0].replace('+', ' ')
    687             name = unquote(name, encoding=encoding, errors=errors)
    688             name = _coerce_result(name)
    689             value = nv[1].replace('+', ' ')
    690             value = unquote(value, encoding=encoding, errors=errors)
    691             value = _coerce_result(value)
    692             r.append((name, value))
    693     return r
    694 
    695 def unquote_plus(string, encoding='utf-8', errors='replace'):
    696     """Like unquote(), but also replace plus signs by spaces, as required for
    697     unquoting HTML form values.
    698 
    699     unquote_plus('%7e/abc+def') -> '~/abc def'
    700     """
    701     string = string.replace('+', ' ')
    702     return unquote(string, encoding, errors)
    703 
    704 _ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    705                          b'abcdefghijklmnopqrstuvwxyz'
    706                          b'0123456789'
    707                          b'_.-')
    708 _ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
    709 _safe_quoters = {}
    710 
    711 class Quoter(collections.defaultdict):
    712     """A mapping from bytes (in range(0,256)) to strings.
    713 
    714     String values are percent-encoded byte values, unless the key < 128, and
    715     in the "safe" set (either the specified safe set, or default set).
    716     """
    717     # Keeps a cache internally, using defaultdict, for efficiency (lookups
    718     # of cached keys don't call Python code at all).
    719     def __init__(self, safe):
    720         """safe: bytes object."""
    721         self.safe = _ALWAYS_SAFE.union(safe)
    722 
    723     def __repr__(self):
    724         # Without this, will just display as a defaultdict
    725         return "<%s %r>" % (self.__class__.__name__, dict(self))
    726 
    727     def __missing__(self, b):
    728         # Handle a cache miss. Store quoted string in cache and return.
    729         res = chr(b) if b in self.safe else '%{:02X}'.format(b)
    730         self[b] = res
    731         return res
    732 
    733 def quote(string, safe='/', encoding=None, errors=None):
    734     """quote('abc def') -> 'abc%20def'
    735 
    736     Each part of a URL, e.g. the path info, the query, etc., has a
    737     different set of reserved characters that must be quoted.
    738 
    739     RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    740     the following reserved characters.
    741 
    742     reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
    743                   "$" | ","
    744 
    745     Each of these characters is reserved in some component of a URL,
    746     but not necessarily in all of them.
    747 
    748     By default, the quote function is intended for quoting the path
    749     section of a URL.  Thus, it will not encode '/'.  This character
    750     is reserved, but in typical usage the quote function is being
    751     called on a path where the existing slash characters are used as
    752     reserved characters.
    753 
    754     string and safe may be either str or bytes objects. encoding and errors
    755     must not be specified if string is a bytes object.
    756 
    757     The optional encoding and errors parameters specify how to deal with
    758     non-ASCII characters, as accepted by the str.encode method.
    759     By default, encoding='utf-8' (characters are encoded with UTF-8), and
    760     errors='strict' (unsupported characters raise a UnicodeEncodeError).
    761     """
    762     if isinstance(string, str):
    763         if not string:
    764             return string
    765         if encoding is None:
    766             encoding = 'utf-8'
    767         if errors is None:
    768             errors = 'strict'
    769         string = string.encode(encoding, errors)
    770     else:
    771         if encoding is not None:
    772             raise TypeError("quote() doesn't support 'encoding' for bytes")
    773         if errors is not None:
    774             raise TypeError("quote() doesn't support 'errors' for bytes")
    775     return quote_from_bytes(string, safe)
    776 
    777 def quote_plus(string, safe='', encoding=None, errors=None):
    778     """Like quote(), but also replace ' ' with '+', as required for quoting
    779     HTML form values. Plus signs in the original string are escaped unless
    780     they are included in safe. It also does not have safe default to '/'.
    781     """
    782     # Check if ' ' in string, where string may either be a str or bytes.  If
    783     # there are no spaces, the regular quote will produce the right answer.
    784     if ((isinstance(string, str) and ' ' not in string) or
    785         (isinstance(string, bytes) and b' ' not in string)):
    786         return quote(string, safe, encoding, errors)
    787     if isinstance(safe, str):
    788         space = ' '
    789     else:
    790         space = b' '
    791     string = quote(string, safe + space, encoding, errors)
    792     return string.replace(' ', '+')
    793 
    794 def quote_from_bytes(bs, safe='/'):
    795     """Like quote(), but accepts a bytes object rather than a str, and does
    796     not perform string-to-bytes encoding.  It always returns an ASCII string.
    797     quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3f'
    798     """
    799     if not isinstance(bs, (bytes, bytearray)):
    800         raise TypeError("quote_from_bytes() expected bytes")
    801     if not bs:
    802         return ''
    803     if isinstance(safe, str):
    804         # Normalize 'safe' by converting to bytes and removing non-ASCII chars
    805         safe = safe.encode('ascii', 'ignore')
    806     else:
    807         safe = bytes([c for c in safe if c < 128])
    808     if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
    809         return bs.decode()
    810     try:
    811         quoter = _safe_quoters[safe]
    812     except KeyError:
    813         _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    814     return ''.join([quoter(char) for char in bs])
    815 
    816 def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
    817               quote_via=quote_plus):
    818     """Encode a dict or sequence of two-element tuples into a URL query string.
    819 
    820     If any values in the query arg are sequences and doseq is true, each
    821     sequence element is converted to a separate parameter.
    822 
    823     If the query arg is a sequence of two-element tuples, the order of the
    824     parameters in the output will match the order of parameters in the
    825     input.
    826 
    827     The components of a query arg may each be either a string or a bytes type.
    828 
    829     The safe, encoding, and errors parameters are passed down to the function
    830     specified by quote_via (encoding and errors only if a component is a str).
    831     """
    832 
    833     if hasattr(query, "items"):
    834         query = query.items()
    835     else:
    836         # It's a bother at times that strings and string-like objects are
    837         # sequences.
    838         try:
    839             # non-sequence items should not work with len()
    840             # non-empty strings will fail this
    841             if len(query) and not isinstance(query[0], tuple):
    842                 raise TypeError
    843             # Zero-length sequences of all types will get here and succeed,
    844             # but that's a minor nit.  Since the original implementation
    845             # allowed empty dicts that type of behavior probably should be
    846             # preserved for consistency
    847         except TypeError:
    848             ty, va, tb = sys.exc_info()
    849             raise TypeError("not a valid non-string sequence "
    850                             "or mapping object").with_traceback(tb)
    851 
    852     l = []
    853     if not doseq:
    854         for k, v in query:
    855             if isinstance(k, bytes):
    856                 k = quote_via(k, safe)
    857             else:
    858                 k = quote_via(str(k), safe, encoding, errors)
    859 
    860             if isinstance(v, bytes):
    861                 v = quote_via(v, safe)
    862             else:
    863                 v = quote_via(str(v), safe, encoding, errors)
    864             l.append(k + '=' + v)
    865     else:
    866         for k, v in query:
    867             if isinstance(k, bytes):
    868                 k = quote_via(k, safe)
    869             else:
    870                 k = quote_via(str(k), safe, encoding, errors)
    871 
    872             if isinstance(v, bytes):
    873                 v = quote_via(v, safe)
    874                 l.append(k + '=' + v)
    875             elif isinstance(v, str):
    876                 v = quote_via(v, safe, encoding, errors)
    877                 l.append(k + '=' + v)
    878             else:
    879                 try:
    880                     # Is this a sufficient test for sequence-ness?
    881                     x = len(v)
    882                 except TypeError:
    883                     # not a sequence
    884                     v = quote_via(str(v), safe, encoding, errors)
    885                     l.append(k + '=' + v)
    886                 else:
    887                     # loop over the sequence
    888                     for elt in v:
    889                         if isinstance(elt, bytes):
    890                             elt = quote_via(elt, safe)
    891                         else:
    892                             elt = quote_via(str(elt), safe, encoding, errors)
    893                         l.append(k + '=' + elt)
    894     return '&'.join(l)
    895 
    896 def to_bytes(url):
    897     """to_bytes(u"URL") --> 'URL'."""
    898     # Most URL schemes require ASCII. If that changes, the conversion
    899     # can be relaxed.
    900     # XXX get rid of to_bytes()
    901     if isinstance(url, str):
    902         try:
    903             url = url.encode("ASCII").decode()
    904         except UnicodeError:
    905             raise UnicodeError("URL " + repr(url) +
    906                                " contains non-ASCII characters")
    907     return url
    908 
    909 def unwrap(url):
    910     """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    911     url = str(url).strip()
    912     if url[:1] == '<' and url[-1:] == '>':
    913         url = url[1:-1].strip()
    914     if url[:4] == 'URL:': url = url[4:].strip()
    915     return url
    916 
    917 _typeprog = None
    918 def splittype(url):
    919     """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    920     global _typeprog
    921     if _typeprog is None:
    922         _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)
    923 
    924     match = _typeprog.match(url)
    925     if match:
    926         scheme, data = match.groups()
    927         return scheme.lower(), data
    928     return None, url
    929 
    930 _hostprog = None
    931 def splithost(url):
    932     """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    933     global _hostprog
    934     if _hostprog is None:
    935         _hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL)
    936 
    937     match = _hostprog.match(url)
    938     if match:
    939         host_port, path = match.groups()
    940         if path and path[0] != '/':
    941             path = '/' + path
    942         return host_port, path
    943     return None, url
    944 
    945 def splituser(host):
    946     """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    947     user, delim, host = host.rpartition('@')
    948     return (user if delim else None), host
    949 
    950 def splitpasswd(user):
    951     """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    952     user, delim, passwd = user.partition(':')
    953     return user, (passwd if delim else None)
    954 
# NOTE: splittag('/path#tag') --> '/path', 'tag' is implemented by splittag()
# further down; the _portprog cache below belongs to splitport().
    956 _portprog = None
    957 def splitport(host):
    958     """splitport('host:port') --> 'host', 'port'."""
    959     global _portprog
    960     if _portprog is None:
    961         _portprog = re.compile('(.*):([0-9]*)$', re.DOTALL)
    962 
    963     match = _portprog.match(host)
    964     if match:
    965         host, port = match.groups()
    966         if port:
    967             return host, port
    968     return host, None
    969 
    970 def splitnport(host, defport=-1):
    971     """Split host and port, returning numeric port.
    972     Return given default port if no ':' found; defaults to -1.
    973     Return numerical port if a valid number are found after ':'.
    974     Return None if ':' but not a valid number."""
    975     host, delim, port = host.rpartition(':')
    976     if not delim:
    977         host = port
    978     elif port:
    979         try:
    980             nport = int(port)
    981         except ValueError:
    982             nport = None
    983         return host, nport
    984     return host, defport
    985 
    986 def splitquery(url):
    987     """splitquery('/path?query') --> '/path', 'query'."""
    988     path, delim, query = url.rpartition('?')
    989     if delim:
    990         return path, query
    991     return url, None
    992 
    993 def splittag(url):
    994     """splittag('/path#tag') --> '/path', 'tag'."""
    995     path, delim, tag = url.rpartition('#')
    996     if delim:
    997         return path, tag
    998     return url, None
    999 
   1000 def splitattr(url):
   1001     """splitattr('/path;attr1=value1;attr2=value2;...') ->
   1002         '/path', ['attr1=value1', 'attr2=value2', ...]."""
   1003     words = url.split(';')
   1004     return words[0], words[1:]
   1005 
   1006 def splitvalue(attr):
   1007     """splitvalue('attr=value') --> 'attr', 'value'."""
   1008     attr, delim, value = attr.partition('=')
   1009     return attr, (value if delim else None)
   1010