# urlparse.py -- from the Python 2.7 standard library.
      1 """Parse (absolute and relative) URLs.
      2 
      3 urlparse module is based upon the following RFC specifications.
      4 
      5 RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.
      7 
      8 RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
      9 and L.Masinter, December 1999.
     10 
     11 RFC 2396:  "Uniform Resource Identifiers (URI)": Generic Syntax by T.
     12 Berners-Lee, R. Fielding, and L. Masinter, August 1998.
     13 
     14 RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
     15 
     16 RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
     17 1995.
     18 
     19 RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
     20 McCahill, December 1994
     21 
     22 RFC 3986 is considered the current standard and any future changes to
     23 urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
     28 
     29 """
     30 
     31 import re
     32 
     33 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
     34            "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
     35 
# A classification of schemes ('' means apply by default)
# Schemes for which urljoin() knows how to resolve relative references.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh']
# Schemes whose '//...' part is a network location (host[:port], userinfo).
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp','nfs','git', 'git+ssh']
# Schemes that may carry ';parameters' in the last path segment
# (split out by urlparse() into the params field).
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() result cache, keyed by (url, scheme, allow_fragments,
# type(url), type(scheme)); cleared wholesale once it reaches this size.
MAX_CACHE_SIZE = 20
_parse_cache = {}
     67 
     68 def clear_cache():
     69     """Clear the parse cache."""
     70     _parse_cache.clear()
     71 
     72 
     73 class ResultMixin(object):
     74     """Shared methods for the parsed result objects."""
     75 
     76     @property
     77     def username(self):
     78         netloc = self.netloc
     79         if "@" in netloc:
     80             userinfo = netloc.rsplit("@", 1)[0]
     81             if ":" in userinfo:
     82                 userinfo = userinfo.split(":", 1)[0]
     83             return userinfo
     84         return None
     85 
     86     @property
     87     def password(self):
     88         netloc = self.netloc
     89         if "@" in netloc:
     90             userinfo = netloc.rsplit("@", 1)[0]
     91             if ":" in userinfo:
     92                 return userinfo.split(":", 1)[1]
     93         return None
     94 
     95     @property
     96     def hostname(self):
     97         netloc = self.netloc.split('@')[-1]
     98         if '[' in netloc and ']' in netloc:
     99             return netloc.split(']')[0][1:].lower()
    100         elif ':' in netloc:
    101             return netloc.split(':')[0].lower()
    102         elif netloc == '':
    103             return None
    104         else:
    105             return netloc.lower()
    106 
    107     @property
    108     def port(self):
    109         netloc = self.netloc.split('@')[-1].split(']')[-1]
    110         if ':' in netloc:
    111             port = netloc.split(':')[1]
    112             port = int(port, 10)
    113             # verify legal port
    114             if (0 <= port <= 65535):
    115                 return port
    116         return None
    117 
    118 from collections import namedtuple
    119 
class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
    """Immutable 5-tuple result type returned by urlsplit()."""

    __slots__ = ()

    def geturl(self):
        # Recombine the five fields into a URL string.
        return urlunsplit(self)
    126 
    127 
class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
    """Immutable 6-tuple result type returned by urlparse()."""

    __slots__ = ()

    def geturl(self):
        # Recombine the six fields into a URL string.
        return urlunparse(self)
    134 
    135 
    136 def urlparse(url, scheme='', allow_fragments=True):
    137     """Parse a URL into 6 components:
    138     <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    139     Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    140     Note that we don't break the components up in smaller bits
    141     (e.g. netloc is a single string) and we don't expand % escapes."""
    142     tuple = urlsplit(url, scheme, allow_fragments)
    143     scheme, netloc, url, query, fragment = tuple
    144     if scheme in uses_params and ';' in url:
    145         url, params = _splitparams(url)
    146     else:
    147         params = ''
    148     return ParseResult(scheme, netloc, url, params, query, fragment)
    149 
    150 def _splitparams(url):
    151     if '/'  in url:
    152         i = url.find(';', url.rfind('/'))
    153         if i < 0:
    154             return url, ''
    155     else:
    156         i = url.find(';')
    157     return url[:i], url[i+1:]
    158 
    159 def _splitnetloc(url, start=0):
    160     delim = len(url)   # position of end of domain part of url, default is end
    161     for c in '/?#':    # look for delimiters; the order is NOT important
    162         wdelim = url.find(c, start)        # find first of this delim
    163         if wdelim >= 0:                    # if found
    164             delim = min(delim, wdelim)     # use earliest delim position
    165     return url[start:delim], url[delim:]   # return (domain, rest)
    166 
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    Raises ValueError for a netloc with unbalanced IPv6 brackets.
    """
    allow_fragments = bool(allow_fragments)
    # Key on argument types too: str and unicode inputs of equal value
    # must not share a cache entry.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                # '[' or ']' without its partner means a malformed
                # RFC 2732 IPv6 literal.
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        # General case: accept the prefix as a scheme only if every
        # character before ':' is a valid scheme character...
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = url[:i].lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
    221 
    222 def urlunparse(data):
    223     """Put a parsed URL back together again.  This may result in a
    224     slightly different, but equivalent URL, if the URL that was parsed
    225     originally had redundant delimiters, e.g. a ? with an empty query
    226     (the draft states that these are equivalent)."""
    227     scheme, netloc, url, params, query, fragment = data
    228     if params:
    229         url = "%s;%s" % (url, params)
    230     return urlunsplit((scheme, netloc, url, query, fragment))
    231 
    232 def urlunsplit(data):
    233     """Combine the elements of a tuple as returned by urlsplit() into a
    234     complete URL as a string. The data argument can be any five-item iterable.
    235     This may result in a slightly different, but equivalent URL, if the URL that
    236     was parsed originally had unnecessary delimiters (for example, a ? with an
    237     empty query; the RFC states that these are equivalent)."""
    238     scheme, netloc, url, query, fragment = data
    239     if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
    240         if url and url[:1] != '/': url = '/' + url
    241         url = '//' + (netloc or '') + url
    242     if scheme:
    243         url = scheme + ':' + url
    244     if query:
    245         url = url + '?' + query
    246     if fragment:
    247         url = url + '#' + fragment
    248     return url
    249 
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    # Trivial cases: an empty base or an empty url contributes nothing.
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # Parse the second URL with the base's scheme as default.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that doesn't support relative
    # resolution, makes the second URL stand on its own.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    # An absolute path replaces the base path entirely.
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # No path and no params: inherit them (and the query, if absent)
    # from the base.
    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Merge: drop the base path's last segment, then append the new path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' means "this directory", i.e. keep a trailing slash.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Collapse one "<segment>/.." pair per pass until none remain
    # (the inner for/else pattern via while-else breaks out when a full
    # pass finds nothing to delete).
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # A leftover trailing '..' collapses to a trailing slash.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
    301 
    302 def urldefrag(url):
    303     """Removes any existing fragment from URL.
    304 
    305     Returns a tuple of the defragmented URL and the fragment.  If
    306     the URL contained no fragments, the second element is the
    307     empty string.
    308     """
    309     if '#' in url:
    310         s, n, p, a, q, frag = urlparse(url)
    311         defrag = urlunparse((s, n, p, a, q, ''))
    312         return defrag, frag
    313     else:
    314         return url, ''
    315 
# Feature-probe for the Python 2 'unicode' type so unquote() can tell
# unicode input apart from byte strings.
try:
    unicode
except NameError:
    # No unicode type (e.g. Python 3): nothing ever qualifies.
    def _is_unicode(x):
        return 0
else:
    # Python 2: report whether x is a unicode (as opposed to str) object.
    def _is_unicode(x):
        return isinstance(x, unicode)
    324 
    325 # unquote method for parse_qs and parse_qsl
    326 # Cannot use directly from urllib as it would create a circular reference
    327 # because urllib uses urlparse methods (urljoin).  If you update this function,
# update it also in urllib.  This code duplication does not exist in Python 3.
    329 
_hexdig = '0123456789ABCDEFabcdef'
# Map every two-hex-digit string (all case combinations) to its byte
# value; used by unquote() to decode %XX escapes with one dict lookup.
_hextochr = dict((a+b, chr(int(a+b,16)))
                 for a in _hexdig for b in _hexdig)
# Matches maximal runs of ASCII characters; used by unquote() to split
# unicode input into ASCII and non-ASCII stretches.
_asciire = re.compile('([\x00-\x7f]+)')
    334 
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Accepts str or (Python 2) unicode; invalid %-escapes are left in
    place rather than raising.
    """
    if _is_unicode(s):
        # Unicode input (Python 2 only): split into ASCII runs, unquote
        # each run as a byte string, and decode the result via latin-1
        # so code points 0-255 round-trip unchanged.
        if '%' not in s:
            return s
        bits = _asciire.split(s)
        res = [bits[0]]
        append = res.append
        for i in range(1, len(bits), 2):
            append(unquote(str(bits[i])).decode('latin1'))
            append(bits[i + 1])
        return ''.join(res)

    bits = s.split('%')
    # fastpath
    if len(bits) == 1:
        return s
    res = [bits[0]]
    append = res.append
    for item in bits[1:]:
        try:
            # The two chars after '%' should be a hex escape; a KeyError
            # means they aren't, and the '%' is kept literally.
            append(_hextochr[item[:2]])
            append(item[2:])
        except KeyError:
            append('%')
            append(item)
    return ''.join(res)
    362 
    363 def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    364     """Parse a query given as a string argument.
    365 
    366         Arguments:
    367 
    368         qs: percent-encoded query string to be parsed
    369 
    370         keep_blank_values: flag indicating whether blank values in
    371             percent-encoded queries should be treated as blank strings.
    372             A true value indicates that blanks should be retained as
    373             blank strings.  The default false value indicates that
    374             blank values are to be ignored and treated as if they were
    375             not included.
    376 
    377         strict_parsing: flag indicating what to do with parsing errors.
    378             If false (the default), errors are silently ignored.
    379             If true, errors raise a ValueError exception.
    380     """
    381     dict = {}
    382     for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
    383         if name in dict:
    384             dict[name].append(value)
    385         else:
    386             dict[name] = [value]
    387     return dict
    388 
    389 def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    390     """Parse a query given as a string argument.
    391 
    392     Arguments:
    393 
    394     qs: percent-encoded query string to be parsed
    395 
    396     keep_blank_values: flag indicating whether blank values in
    397         percent-encoded queries should be treated as blank strings.  A
    398         true value indicates that blanks should be retained as blank
    399         strings.  The default false value indicates that blank values
    400         are to be ignored and treated as if they were  not included.
    401 
    402     strict_parsing: flag indicating what to do with parsing errors. If
    403         false (the default), errors are silently ignored. If true,
    404         errors raise a ValueError exception.
    405 
    406     Returns a list, as G-d intended.
    407     """
    408     pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    409     r = []
    410     for name_value in pairs:
    411         if not name_value and not strict_parsing:
    412             continue
    413         nv = name_value.split('=', 1)
    414         if len(nv) != 2:
    415             if strict_parsing:
    416                 raise ValueError, "bad query field: %r" % (name_value,)
    417             # Handle case of a control-name with no equal sign
    418             if keep_blank_values:
    419                 nv.append('')
    420             else:
    421                 continue
    422         if len(nv[1]) or keep_blank_values:
    423             name = unquote(nv[0].replace('+', ' '))
    424             value = unquote(nv[1].replace('+', ' '))
    425             r.append((name, value))
    426 
    427     return r
    428