      1 """Parse (absolute and relative) URLs.
      2 
      3 urlparse module is based upon the following RFC specifications.
      4 
      5 RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
      6 and L.  Masinter, January 2005.
      7 
      8 RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
      9 and L.Masinter, December 1999.
     10 
     11 RFC 2396:  "Uniform Resource Identifiers (URI)": Generic Syntax by T.
     12 Berners-Lee, R. Fielding, and L. Masinter, August 1998.
     13 
     14 RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
     15 
     16 RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
     17 1995.
     18 
     19 RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
     20 McCahill, December 1994
     21 
     22 RFC 3986 is considered the current standard and any future changes to
     23 urlparse module should conform with it.  The urlparse module is
     24 currently not entirely compliant with this RFC due to defacto
     25 scenarios for parsing, and for backward compatibility purposes, some
     26 parsing quirks from older RFCs are retained. The testcases in
     27 test_urlparse.py provides a good indicator of parsing behavior.
     28 
     29 """

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache."""
    _parse_cache.clear()


class ResultMixin(object):
    """Shared methods for the parsed result objects."""

    @property
    def username(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            return int(port, 10)
        else:
            return None

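# Illustrative doctest-style sketch of the derived attributes above; the
# credentials, bracketed IPv6 host and port in this URL are hypothetical.
#
#   >>> p = urlsplit('http://user:secret@[::1]:8080/index.html')
#   >>> p.username, p.password, p.hostname, p.port
#   ('user', 'secret', '::1', 8080)
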
from collections import namedtuple

class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunsplit(self)


class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunparse(self)


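# Both result classes above round-trip through geturl(); a minimal
# doctest-style sketch with a hypothetical URL:
#
#   >>> urlsplit('http://www.example.com/a?b=c').geturl()
#   'http://www.example.com/a?b=c'
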
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    tuple = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = tuple
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)

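# A doctest-style sketch of the 6-tuple returned by urlparse(); the URL is
# hypothetical and the result is shown via tuple() to keep the line short:
#
#   >>> tuple(urlparse('http://netloc/path;param?query=arg#frag'))
#   ('http', 'netloc', '/path', 'param', 'query=arg', 'frag')
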
def _splitparams(url):
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            try:
                # make sure "url" is not actually a port number (in which case
                # "scheme" is really part of the path)
                _testportnum = int(url[i+1:])
            except ValueError:
                scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v

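# A doctest-style sketch of the 5-tuple returned by urlsplit() for a
# hypothetical https URL (https is listed in uses_query and uses_fragment,
# so the query and fragment are split out):
#
#   >>> tuple(urlsplit('https://www.example.org/pub/a.html?lang=en#top'))
#   ('https', 'www.example.org', '/pub/a.html', 'lang=en', 'top')
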
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = data
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

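# A doctest-style sketch of urlunparse() reinserting the params component;
# the component values are hypothetical:
#
#   >>> urlunparse(('http', 'host.example', '/p', 'type=a', 'q=1', 'top'))
#   'http://host.example/p;type=a?q=1#top'
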
def urlunsplit(data):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment = data
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url

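# Two doctest-style sketches: a plain recombination, and the documented
# behaviour that a redundant '?' delimiter is dropped on the round trip
# (the URLs are hypothetical):
#
#   >>> urlunsplit(('http', 'www.example.com', '/a/b', 'x=1', 'frag'))
#   'http://www.example.com/a/b?x=1#frag'
#   >>> urlunsplit(urlsplit('http://www.example.com/a/b?'))
#   'http://www.example.com/a/b'
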
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))

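# Doctest-style sketches of relative resolution, including '..' handling;
# all URLs are hypothetical:
#
#   >>> urljoin('http://www.example.com/docs/intro.html', 'part2.html#s1')
#   'http://www.example.com/docs/part2.html#s1'
#   >>> urljoin('http://www.example.com/docs/a/b.html', '../c.html')
#   'http://www.example.com/docs/c.html'
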
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''

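# A doctest-style sketch with a hypothetical URL:
#
#   >>> urldefrag('http://www.example.com/page.html#section2')
#   ('http://www.example.com/page.html', 'section2')
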
# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular import,
# because urllib uses urlparse methods (urljoin).  If you update this
# function, update it also in urllib.  This code duplication does not
# exist in Python 3.

_hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a+b, chr(int(a+b, 16)))
                 for a in _hexdig for b in _hexdig)

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    res = s.split('%')
    # fastpath
    if len(res) == 1:
        return s
    s = res[0]
    for item in res[1:]:
        try:
            s += _hextochr[item[:2]] + item[2:]
        except KeyError:
            s += '%' + item
        except UnicodeDecodeError:
            s += unichr(int(item[:2], 16)) + item[2:]
    return s

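# Doctest-style sketches of the behaviour above: valid escapes are decoded,
# while a malformed escape is passed through with its '%' intact:
#
#   >>> unquote('abc%20def%3F')
#   'abc def?'
#   >>> unquote('100%zz')
#   '100%zz'
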
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.
    """
    dict = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in dict:
            dict[name].append(value)
        else:
            dict[name] = [value]
    return dict

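# A doctest-style sketch with a hypothetical query string; repeated names
# are collected into a list per key (compared with == since dictionary
# ordering is not guaranteed):
#
#   >>> parse_qs('a=1&a=2&b=3') == {'a': ['1', '2'], 'b': ['3']}
#   True
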
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) pairs.
    """
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError, "bad query field: %r" % (name_value,)
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r

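# A doctest-style sketch with a hypothetical query string; with
# keep_blank_values a name with an empty value is retained, and '+' and
# percent-escapes are decoded:
#
#   >>> parse_qsl('key=val&flag=&x=%41+%42', keep_blank_values=1)
#   [('key', 'val'), ('flag', ''), ('x', 'A B')]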