      1 """Parse (absolute and relative) URLs.
      2 
      3 urlparse module is based upon the following RFC specifications.
      4 
      5 RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
      6 and L.  Masinter, January 2005.
      7 
      8 RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
      9 and L.Masinter, December 1999.
     10 
     11 RFC 2396:  "Uniform Resource Identifiers (URI)": Generic Syntax by T.
     12 Berners-Lee, R. Fielding, and L. Masinter, August 1998.
     13 
     14 RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
     15 
     16 RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
     17 1995.
     18 
     19 RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
     20 McCahill, December 1994
     21 
     22 RFC 3986 is considered the current standard and any future changes to
     23 urlparse module should conform with it.  The urlparse module is
     24 currently not entirely compliant with this RFC due to defacto
     25 scenarios for parsing, and for backward compatibility purposes, some
     26 parsing quirks from older RFCs are retained. The testcases in
     27 test_urlparse.py provides a good indicator of parsing behavior.
     28 
     29 """

import re

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

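# Illustrative sketch (not part of the module): membership in these lists
# changes how a URL is parsed.  For example, parameters are only split off
# the path for schemes listed in uses_params; the values below are what
# urlparse() (defined later in this file) is expected to return.
#
#   urlparse('http://host/p;x=1')          # 'http' is in uses_params
#     -> ('http', 'host', '/p', 'x=1', '', '')
#   urlparse('news:comp.lang.python;x')    # 'news' is not, so ';x' stays
#     -> ('news', '', 'comp.lang.python;x', '', '', '')
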
# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache."""
    _parse_cache.clear()


class ResultMixin(object):
    """Shared methods for the parsed result objects."""

    @property
    def username(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            if port:
                port = int(port, 10)
                # verify legal port
                if (0 <= port <= 65535):
                    return port
        return None

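# Illustrative sketch (not part of the module): the properties above decompose
# the netloc of a parsed result.  Given a hypothetical URL with userinfo, an
# IPv6 literal and an explicit port, the expected values are:
#
#   r = urlsplit('http://user:secret@[::1]:8080/path')
#   r.username -> 'user'
#   r.password -> 'secret'
#   r.hostname -> '::1'      # brackets stripped, lowercased
#   r.port     -> 8080       # int; None if absent or out of range
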
from collections import namedtuple

class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunsplit(self)


class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunparse(self)


def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up into smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    tuple = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = tuple
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)

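# Illustrative sketch (not part of the module): urlparse() returns a
# ParseResult whose fields can be read positionally or by name, and
# geturl() reassembles the pieces.
#
#   p = urlparse('http://www.example.com/path;params?q=1#frag')
#     -> ('http', 'www.example.com', '/path', 'params', 'q=1', 'frag')
#   p.netloc   -> 'www.example.com'
#   p.geturl() -> 'http://www.example.com/path;params?q=1#frag'
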
def _splitparams(url):
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up into smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = url[:i].lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v

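# Illustrative sketch (not part of the module): urlsplit() is like urlparse()
# but never separates params from the path, so the result has five fields.
#
#   urlsplit('http://user@example.com:8042/over/there?name=ferret#nose')
#     -> ('http', 'user@example.com:8042', '/over/there', 'name=ferret', 'nose')
#   urlsplit('//example.com/x')       # scheme-relative reference
#     -> ('', 'example.com', '/x', '', '')
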
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the RFC states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = data
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

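# Illustrative sketch (not part of the module): urlunparse() inverts
# urlparse(), re-attaching params with ';' before delegating to urlunsplit().
#
#   urlunparse(('http', 'example.com', '/path', 'x=1', 'q=2', 'frag'))
#     -> 'http://example.com/path;x=1?q=2#frag'
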
def urlunsplit(data):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment = data
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url

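# Illustrative sketch (not part of the module): splitting and then unsplitting
# normalizes redundant delimiters, so an empty '?' or '#' is dropped.
#
#   urlunsplit(urlsplit('http://example.com/path?'))
#     -> 'http://example.com/path'
#   urlunsplit(('mailto', '', 'user@example.com', '', ''))
#     -> 'mailto:user@example.com'
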
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))

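# Illustrative sketch (not part of the module): urljoin() resolves a reference
# against a base URL, collapsing '.' and '..' segments in the merged path.
#
#   urljoin('http://a/b/c/d;p?q', 'g')      -> 'http://a/b/c/g'
#   urljoin('http://a/b/c/d;p?q', '../g')   -> 'http://a/b/g'
#   urljoin('http://a/b/c/d;p?q', '//h/x')  -> 'http://h/x'
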
def urldefrag(url):
    """Removes any existing fragment from a URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragment, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''

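# Illustrative sketch (not part of the module):
#
#   urldefrag('http://example.com/page#section')
#     -> ('http://example.com/page', 'section')
#   urldefrag('http://example.com/page')
#     -> ('http://example.com/page', '')
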
try:
    unicode
except NameError:
    def _is_unicode(x):
        return 0
else:
    def _is_unicode(x):
        return isinstance(x, unicode)

# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin).  If you update this function,
# update it also in urllib.  This code duplication does not exist in Python 3.

_hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a+b, chr(int(a+b, 16)))
                 for a in _hexdig for b in _hexdig)
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    if _is_unicode(s):
        if '%' not in s:
            return s
        bits = _asciire.split(s)
        res = [bits[0]]
        append = res.append
        for i in range(1, len(bits), 2):
            append(unquote(str(bits[i])).decode('latin1'))
            append(bits[i + 1])
        return ''.join(res)

    bits = s.split('%')
    # fastpath
    if len(bits) == 1:
        return s
    res = [bits[0]]
    append = res.append
    for item in bits[1:]:
        try:
            append(_hextochr[item[:2]])
            append(item[2:])
        except KeyError:
            append('%')
            append(item)
    return ''.join(res)

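# Illustrative sketch (not part of the module): unquote() decodes %XX escapes
# using the _hextochr table above; malformed escapes are left untouched.
#
#   unquote('abc%20def')    -> 'abc def'
#   unquote('100%25')       -> '100%'
#   unquote('bad%zzescape') -> 'bad%zzescape'   # '%zz' is not valid hex
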
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.
    """
    dict = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in dict:
            dict[name].append(value)
        else:
            dict[name] = [value]
    return dict

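# Illustrative sketch (not part of the module): parse_qs() collects repeated
# field names into lists; blank values are dropped unless requested.
#
#   parse_qs('a=1&a=2&b=')
#     -> {'a': ['1', '2']}
#   parse_qs('a=1&a=2&b=', keep_blank_values=1)
#     -> {'a': ['1', '2'], 'b': ['']}
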
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples.
    """
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError, "bad query field: %r" % (name_value,)
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
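
# Illustrative sketch (not part of the module): parse_qsl() preserves the
# order of fields, splits on both '&' and ';', and decodes '+' and %XX.
#
#   parse_qsl('key=val&name=J+Doe&flag')
#     -> [('key', 'val'), ('name', 'J Doe')]          # 'flag' dropped
#   parse_qsl('key=val&name=J+Doe&flag', keep_blank_values=1)
#     -> [('key', 'val'), ('name', 'J Doe'), ('flag', '')]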
    429