Home | History | Annotate | Download | only in python2.7
      1 r"""HTTP cookie handling for web clients.
      2 
      3 This module has (now fairly distant) origins in Gisle Aas' Perl module
      4 HTTP::Cookies, from the libwww-perl library.
      5 
      6 Docstrings, comments and debug strings in this code refer to the
      7 attributes of the HTTP cookie system as cookie-attributes, to distinguish
      8 them clearly from Python attributes.
      9 
     10 Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
     11 distributed with the Python standard library, but are available from
     12 http://wwwsearch.sf.net/):
     13 
     14                         CookieJar____
     15                         /     \      \
     16             FileCookieJar      \      \
     17              /    |   \         \      \
     18  MozillaCookieJar | LWPCookieJar \      \
     19                   |               |      \
     20                   |   ---MSIEBase |       \
     21                   |  /      |     |        \
     22                   | /   MSIEDBCookieJar BSDDBCookieJar
     23                   |/
     24                MSIECookieJar
     25 
     26 """
     27 
# Names exported by "from <this module> import *".  Helper functions defined
# in this module that are not listed here are not part of the star-import API.
__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
           'FileCookieJar', 'LWPCookieJar', 'lwp_cookie_str', 'LoadError',
           'MozillaCookieJar']
     31 
     32 import re, urlparse, copy, time, urllib
     33 try:
     34     import threading as _threading
     35 except ImportError:
     36     import dummy_threading as _threading
     37 import httplib  # only for the default HTTP port
     38 from calendar import timegm
     39 
     40 debug = False   # set to True to enable debugging via the logging module
     41 logger = None
     42 
     43 def _debug(*args):
     44     if not debug:
     45         return
     46     global logger
     47     if not logger:
     48         import logging
     49         logger = logging.getLogger("cookielib")
     50     return logger.debug(*args)
     51 
     52 
# httplib's default HTTP port number, converted to a string.
DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
# Message text describing a missing filename (neither passed in nor set on
# the CookieJar instance).
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
     56 
     57 def _warn_unhandled_exception():
     58     # There are a few catch-all except: statements in this module, for
     59     # catching input that's bad in unexpected ways.  Warn if any
     60     # exceptions are caught there.
     61     import warnings, traceback, StringIO
     62     f = StringIO.StringIO()
     63     traceback.print_exc(None, f)
     64     msg = f.getvalue()
     65     warnings.warn("cookielib bug!\n%s" % msg, stacklevel=2)
     66 
     67 
     68 # Date/time conversion
     69 # -----------------------------------------------------------------------------
     70 
     71 EPOCH_YEAR = 1970
     72 def _timegm(tt):
     73     year, month, mday, hour, min, sec = tt[:6]
     74     if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
     75         (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
     76         return timegm(tt)
     77     else:
     78         return None
     79 
     80 DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
     81 MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
     82           "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
     83 MONTHS_LOWER = []
     84 for month in MONTHS: MONTHS_LOWER.append(month.lower())
     85 
     86 def time2isoz(t=None):
     87     """Return a string representing time in seconds since epoch, t.
     88 
     89     If the function is called without an argument, it will use the current
     90     time.
     91 
     92     The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
     93     representing Universal Time (UTC, aka GMT).  An example of this format is:
     94 
     95     1994-11-24 08:49:37Z
     96 
     97     """
     98     if t is None: t = time.time()
     99     year, mon, mday, hour, min, sec = time.gmtime(t)[:6]
    100     return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
    101         year, mon, mday, hour, min, sec)
    102 
    103 def time2netscape(t=None):
    104     """Return a string representing time in seconds since epoch, t.
    105 
    106     If the function is called without an argument, it will use the current
    107     time.
    108 
    109     The format of the returned string is like this:
    110 
    111     Wed, DD-Mon-YYYY HH:MM:SS GMT
    112 
    113     """
    114     if t is None: t = time.time()
    115     year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7]
    116     return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % (
    117         DAYS[wday], mday, MONTHS[mon-1], year, hour, min, sec)
    118 
    119 
    120 UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}
    121 
    122 TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
    123 def offset_from_tz_string(tz):
    124     offset = None
    125     if tz in UTC_ZONES:
    126         offset = 0
    127     else:
    128         m = TIMEZONE_RE.search(tz)
    129         if m:
    130             offset = 3600 * int(m.group(2))
    131             if m.group(3):
    132                 offset = offset + 60 * int(m.group(3))
    133             if m.group(1) == '-':
    134                 offset = -offset
    135     return offset
    136 
    137 def _str2time(day, mon, yr, hr, min, sec, tz):
    138     # translate month name to number
    139     # month numbers start with 1 (January)
    140     try:
    141         mon = MONTHS_LOWER.index(mon.lower())+1
    142     except ValueError:
    143         # maybe it's already a number
    144         try:
    145             imon = int(mon)
    146         except ValueError:
    147             return None
    148         if 1 <= imon <= 12:
    149             mon = imon
    150         else:
    151             return None
    152 
    153     # make sure clock elements are defined
    154     if hr is None: hr = 0
    155     if min is None: min = 0
    156     if sec is None: sec = 0
    157 
    158     yr = int(yr)
    159     day = int(day)
    160     hr = int(hr)
    161     min = int(min)
    162     sec = int(sec)
    163 
    164     if yr < 1000:
    165         # find "obvious" year
    166         cur_yr = time.localtime(time.time())[0]
    167         m = cur_yr % 100
    168         tmp = yr
    169         yr = yr + cur_yr - m
    170         m = m - tmp
    171         if abs(m) > 50:
    172             if m > 0: yr = yr + 100
    173             else: yr = yr - 100
    174 
    175     # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    176     t = _timegm((yr, mon, day, hr, min, sec, tz))
    177 
    178     if t is not None:
    179         # adjust time using timezone string, to get absolute time since epoch
    180         if tz is None:
    181             tz = "UTC"
    182         tz = tz.upper()
    183         offset = offset_from_tz_string(tz)
    184         if offset is None:
    185             return None
    186         t = t - offset
    187 
    188     return t
    189 
    190 STRICT_DATE_RE = re.compile(
    191     r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    192     "(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
    193 WEEKDAY_RE = re.compile(
    194     r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
    195 LOOSE_HTTP_DATE_RE = re.compile(
    196     r"""^
    197     (\d\d?)            # day
    198        (?:\s+|[-\/])
    199     (\w+)              # month
    200         (?:\s+|[-\/])
    201     (\d+)              # year
    202     (?:
    203           (?:\s+|:)    # separator before clock
    204        (\d\d?):(\d\d)  # hour:min
    205        (?::(\d\d))?    # optional seconds
    206     )?                 # optional clock
    207        \s*
    208     ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
    209        \s*
    210     (?:\(\w+\))?       # ASCII representation of timezone in parens.
    211        \s*$""", re.X)
    212 def http2time(text):
    213     """Returns time in seconds since epoch of time represented by a string.
    214 
    215     Return value is an integer.
    216 
    217     None is returned if the format of str is unrecognized, the time is outside
    218     the representable range, or the timezone string is not recognized.  If the
    219     string contains no timezone, UTC is assumed.
    220 
    221     The timezone in the string may be numerical (like "-0800" or "+0100") or a
    222     string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    223     timezone strings equivalent to UTC (zero offset) are known to the function.
    224 
    225     The function loosely parses the following formats:
    226 
    227     Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    228     Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    229     Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    230     09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    231     08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    232     08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)
    233 
    234     The parser ignores leading and trailing whitespace.  The time may be
    235     absent.
    236 
    237     If the year is given with only 2 digits, the function will select the
    238     century that makes the year closest to the current date.
    239 
    240     """
    241     # fast exit for strictly conforming string
    242     m = STRICT_DATE_RE.search(text)
    243     if m:
    244         g = m.groups()
    245         mon = MONTHS_LOWER.index(g[1].lower()) + 1
    246         tt = (int(g[2]), mon, int(g[0]),
    247               int(g[3]), int(g[4]), float(g[5]))
    248         return _timegm(tt)
    249 
    250     # No, we need some messy parsing...
    251 
    252     # clean up
    253     text = text.lstrip()
    254     text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday
    255 
    256     # tz is time zone specifier string
    257     day, mon, yr, hr, min, sec, tz = [None]*7
    258 
    259     # loose regexp parse
    260     m = LOOSE_HTTP_DATE_RE.search(text)
    261     if m is not None:
    262         day, mon, yr, hr, min, sec, tz = m.groups()
    263     else:
    264         return None  # bad format
    265 
    266     return _str2time(day, mon, yr, hr, min, sec, tz)
    267 
    268 ISO_DATE_RE = re.compile(
    269     """^
    270     (\d{4})              # year
    271        [-\/]?
    272     (\d\d?)              # numerical month
    273        [-\/]?
    274     (\d\d?)              # day
    275    (?:
    276          (?:\s+|[-:Tt])  # separator before clock
    277       (\d\d?):?(\d\d)    # hour:min
    278       (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
    279    )?                    # optional clock
    280       \s*
    281    ([-+]?\d\d?:?(:?\d\d)?
    282     |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
    283       \s*$""", re.X)
    284 def iso2time(text):
    285     """
    286     As for http2time, but parses the ISO 8601 formats:
    287 
    288     1994-02-03 14:15:29 -0100    -- ISO 8601 format
    289     1994-02-03 14:15:29          -- zone is optional
    290     1994-02-03                   -- only date
    291     1994-02-03T14:15:29          -- Use T as separator
    292     19940203T141529Z             -- ISO 8601 compact format
    293     19940203                     -- only date
    294 
    295     """
    296     # clean up
    297     text = text.lstrip()
    298 
    299     # tz is time zone specifier string
    300     day, mon, yr, hr, min, sec, tz = [None]*7
    301 
    302     # loose regexp parse
    303     m = ISO_DATE_RE.search(text)
    304     if m is not None:
    305         # XXX there's an extra bit of the timezone I'm ignoring here: is
    306         #   this the right thing to do?
    307         yr, mon, day, hr, min, sec, tz, _ = m.groups()
    308     else:
    309         return None  # bad format
    310 
    311     return _str2time(day, mon, yr, hr, min, sec, tz)
    312 
    313 
    314 # Header parsing
    315 # -----------------------------------------------------------------------------
    316 
    317 def unmatched(match):
    318     """Return unmatched part of re.Match object."""
    319     start, end = match.span(0)
    320     return match.string[:start]+match.string[end:]
    321 
# A token: optional leading whitespace, then a run of characters that are
# not "=", whitespace, ";" or ",".
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
# "=" followed by a double-quoted value, which may contain backslash escapes.
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
# "=" followed by an unquoted value running up to whitespace, ";" or ",".
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
# A backslash escape inside a quoted value; group 1 is the escaped character.
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    # Guard against the common mistake of passing a single string instead of
    # a sequence of strings (a string would be iterated character by
    # character below).
    assert not isinstance(header_values, basestring)
    result = []
    for text in header_values:
        orig_text = text  # kept only for the diagnostic message below
        pairs = []
        # Each iteration consumes a prefix of text: a token (with optional
        # value), a header-separating comma, or a run of junk characters.
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    # drop the backslash from each escaped character
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                # start a new header; empty headers are not recorded
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
                # The pattern can match the empty string, so subn always
                # reports a substitution here; the assertion documents that
                # this branch is expected to make progress.
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        # record the final (or only) header of this value
        if pairs: result.append(pairs)
    return result
    410 
    411 HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
    412 def join_header_words(lists):
    413     """Do the inverse (almost) of the conversion done by split_header_words.
    414 
    415     Takes a list of lists of (key, value) pairs and produces a single header
    416     value.  Attribute values are quoted if needed.
    417 
    418     >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    419     'text/plain; charset="iso-8859/1"'
    420     >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    421     'text/plain, charset="iso-8859/1"'
    422 
    423     """
    424     headers = []
    425     for pairs in lists:
    426         attr = []
    427         for k, v in pairs:
    428             if v is not None:
    429                 if not re.search(r"^\w+$", v):
    430                     v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v)  # escape " and \
    431                     v = '"%s"' % v
    432                 k = "%s=%s" % (k, v)
    433             attr.append(k)
    434         if attr: headers.append("; ".join(attr))
    435     return ", ".join(headers)
    436 
    437 def _strip_quotes(text):
    438     if text.startswith('"'):
    439         text = text[1:]
    440     if text.endswith('"'):
    441         text = text[:-1]
    442     return text
    443 
    444 def parse_ns_headers(ns_headers):
    445     """Ad-hoc parser for Netscape protocol cookie-attributes.
    446 
    447     The old Netscape cookie format for Set-Cookie can for instance contain
    448     an unquoted "," in the expires field, so we have to use this ad-hoc
    449     parser instead of split_header_words.
    450 
    451     XXX This may not make the best possible effort to parse all the crap
    452     that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    453     parser is probably better, so could do worse than following that if
    454     this ever gives any trouble.
    455 
    456     Currently, this is also used for parsing RFC 2109 cookies.
    457 
    458     """
    459     known_attrs = ("expires", "domain", "path", "secure",
    460                    # RFC 2109 attrs (may turn up in Netscape cookies, too)
    461                    "version", "port", "max-age")
    462 
    463     result = []
    464     for ns_header in ns_headers:
    465         pairs = []
    466         version_set = False
    467         for ii, param in enumerate(re.split(r";\s*", ns_header)):
    468             param = param.rstrip()
    469             if param == "": continue
    470             if "=" not in param:
    471                 k, v = param, None
    472             else:
    473                 k, v = re.split(r"\s*=\s*", param, 1)
    474                 k = k.lstrip()
    475             if ii != 0:
    476                 lc = k.lower()
    477                 if lc in known_attrs:
    478                     k = lc
    479                 if k == "version":
    480                     # This is an RFC 2109 cookie.
    481                     v = _strip_quotes(v)
    482                     version_set = True
    483                 if k == "expires":
    484                     # convert expires date to seconds since epoch
    485                     v = http2time(_strip_quotes(v))  # None if invalid
    486             pairs.append((k, v))
    487 
    488         if pairs:
    489             if not version_set:
    490                 pairs.append(("version", "0"))
    491             result.append(pairs)
    492 
    493     return result
    494 
    495 
    496 IPV4_RE = re.compile(r"\.\d+$")
    497 def is_HDN(text):
    498     """Return True if text is a host domain name."""
    499     # XXX
    500     # This may well be wrong.  Which RFC is HDN defined in, if any (for
    501     #  the purposes of RFC 2965)?
    502     # For the current implementation, what about IPv6?  Remember to look
    503     #  at other uses of IPV4_RE also, if change this.
    504     if IPV4_RE.search(text):
    505         return False
    506     if text == "":
    507         return False
    508     if text[0] == "." or text[-1] == ".":
    509         return False
    510     return True
    511 
    512 def domain_match(A, B):
    513     """Return True if domain A domain-matches domain B, according to RFC 2965.
    514 
    515     A and B may be host domain names or IP addresses.
    516 
    517     RFC 2965, section 1:
    518 
    519     Host names can be specified either as an IP address or a HDN string.
    520     Sometimes we compare one host name with another.  (Such comparisons SHALL
    521     be case-insensitive.)  Host A's name domain-matches host B's if
    522 
    523          *  their host name strings string-compare equal; or
    524 
    525          * A is a HDN string and has the form NB, where N is a non-empty
    526             name string, B has the form .B', and B' is a HDN string.  (So,
    527             x.y.com domain-matches .Y.com but not Y.com.)
    528 
    529     Note that domain-match is not a commutative operation: a.b.c.com
    530     domain-matches .c.com, but not the reverse.
    531 
    532     """
    533     # Note that, if A or B are IP addresses, the only relevant part of the
    534     # definition of the domain-match algorithm is the direct string-compare.
    535     A = A.lower()
    536     B = B.lower()
    537     if A == B:
    538         return True
    539     if not is_HDN(A):
    540         return False
    541     i = A.rfind(B)
    542     if i == -1 or i == 0:
    543         # A does not have form NB, or N is the empty string
    544         return False
    545     if not B.startswith("."):
    546         return False
    547     if not is_HDN(B[1:]):
    548         return False
    549     return True
    550 
    551 def liberal_is_HDN(text):
    552     """Return True if text is a sort-of-like a host domain name.
    553 
    554     For accepting/blocking domains.
    555 
    556     """
    557     if IPV4_RE.search(text):
    558         return False
    559     return True
    560 
    561 def user_domain_match(A, B):
    562     """For blocking/accepting domains.
    563 
    564     A and B may be host domain names or IP addresses.
    565 
    566     """
    567     A = A.lower()
    568     B = B.lower()
    569     if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
    570         if A == B:
    571             # equal IP addresses
    572             return True
    573         return False
    574     initial_dot = B.startswith(".")
    575     if initial_dot and A.endswith(B):
    576         return True
    577     if not initial_dot and A == B:
    578         return True
    579     return False
    580 
    581 cut_port_re = re.compile(r":\d+$")
    582 def request_host(request):
    583     """Return request-host, as defined by RFC 2965.
    584 
    585     Variation from RFC: returned value is lowercased, for convenient
    586     comparison.
    587 
    588     """
    589     url = request.get_full_url()
    590     host = urlparse.urlparse(url)[1]
    591     if host == "":
    592         host = request.get_header("Host", "")
    593 
    594     # remove port, if present
    595     host = cut_port_re.sub("", host, 1)
    596     return host.lower()
    597 
    598 def eff_request_host(request):
    599     """Return a tuple (request-host, effective request-host name).
    600 
    601     As defined by RFC 2965, except both are lowercased.
    602 
    603     """
    604     erhn = req_host = request_host(request)
    605     if req_host.find(".") == -1 and not IPV4_RE.search(req_host):
    606         erhn = req_host + ".local"
    607     return req_host, erhn
    608 
    609 def request_path(request):
    610     """Path component of request-URI, as defined by RFC 2965."""
    611     url = request.get_full_url()
    612     parts = urlparse.urlsplit(url)
    613     path = escape_path(parts.path)
    614     if not path.startswith("/"):
    615         # fix bad RFC 2396 absoluteURI
    616         path = "/" + path
    617     return path
    618 
    619 def request_port(request):
    620     host = request.get_host()
    621     i = host.find(':')
    622     if i >= 0:
    623         port = host[i+1:]
    624         try:
    625             int(port)
    626         except ValueError:
    627             _debug("nonnumeric port: '%s'", port)
    628             return None
    629     else:
    630         port = DEFAULT_HTTP_PORT
    631     return port
    632 
    633 # Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
    634 # need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
    635 HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
    636 ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
    637 def uppercase_escaped_char(match):
    638     return "%%%s" % match.group(1).upper()
    639 def escape_path(path):
    640     """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    641     # There's no knowing what character encoding was used to create URLs
    642     # containing %-escapes, but since we have to pick one to escape invalid
    643     # path characters, we pick UTF-8, as recommended in the HTML 4.0
    644     # specification:
    645     # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    646     # And here, kind of: draft-fielding-uri-rfc2396bis-03
    647     # (And in draft IRI specification: draft-duerst-iri-05)
    648     # (And here, for new URI schemes: RFC 2718)
    649     if isinstance(path, unicode):
    650         path = path.encode("utf-8")
    651     path = urllib.quote(path, HTTP_PATH_SAFE)
    652     path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path)
    653     return path
    654 
    655 def reach(h):
    656     """Return reach of host h, as defined by RFC 2965, section 1.
    657 
    658     The reach R of a host name H is defined as follows:
    659 
    660        *  If
    661 
    662           -  H is the host domain name of a host; and,
    663 
    664           -  H has the form A.B; and
    665 
    666           -  A has no embedded (that is, interior) dots; and
    667 
    668           -  B has at least one embedded dot, or B is the string "local".
    669              then the reach of H is .B.
    670 
    671        *  Otherwise, the reach of H is H.
    672 
    673     >>> reach("www.acme.com")
    674     '.acme.com'
    675     >>> reach("acme.com")
    676     'acme.com'
    677     >>> reach("acme.local")
    678     '.local'
    679 
    680     """
    681     i = h.find(".")
    682     if i >= 0:
    683         #a = h[:i]  # this line is only here to show what a is
    684         b = h[i+1:]
    685         i = b.find(".")
    686         if is_HDN(h) and (i >= 0 or b == "local"):
    687             return "."+b
    688     return h
    689 
    690 def is_third_party(request):
    691     """
    692 
    693     RFC 2965, section 3.3.6:
    694 
    695         An unverifiable transaction is to a third-party host if its request-
    696         host U does not domain-match the reach R of the request-host O in the
    697         origin transaction.
    698 
    699     """
    700     req_host = request_host(request)
    701     if not domain_match(req_host, reach(request.get_origin_req_host())):
    702         return True
    703     else:
    704         return False
    705 
    706 
    707 class Cookie:
    708     """HTTP Cookie.
    709 
    710     This class represents both Netscape and RFC 2965 cookies.
    711 
    712     This is deliberately a very simple class.  It just holds attributes.  It's
    713     possible to construct Cookie instances that don't comply with the cookie
    714     standards.  CookieJar.make_cookies is the factory function for Cookie
    715     objects -- it deals with cookie parsing, supplying defaults, and
    716     normalising to the representation used in this class.  CookiePolicy is
    717     responsible for checking them to see whether they should be accepted from
    718     and returned to the server.
    719 
    720     Note that the port may be present in the headers, but unspecified ("Port"
    721     rather than"Port=80", for example); if this is the case, port is None.
    722 
    723     """
    724 
    725     def __init__(self, version, name, value,
    726                  port, port_specified,
    727                  domain, domain_specified, domain_initial_dot,
    728                  path, path_specified,
    729                  secure,
    730                  expires,
    731                  discard,
    732                  comment,
    733                  comment_url,
    734                  rest,
    735                  rfc2109=False,
    736                  ):
    737 
    738         if version is not None: version = int(version)
    739         if expires is not None: expires = int(expires)
    740         if port is None and port_specified is True:
    741             raise ValueError("if port is None, port_specified must be false")
    742 
    743         self.version = version
    744         self.name = name
    745         self.value = value
    746         self.port = port
    747         self.port_specified = port_specified
    748         # normalise case, as per RFC 2965 section 3.3.3
    749         self.domain = domain.lower()
    750         self.domain_specified = domain_specified
    751         # Sigh.  We need to know whether the domain given in the
    752         # cookie-attribute had an initial dot, in order to follow RFC 2965
    753         # (as clarified in draft errata).  Needed for the returned $Domain
    754         # value.
    755         self.domain_initial_dot = domain_initial_dot
    756         self.path = path
    757         self.path_specified = path_specified
    758         self.secure = secure
    759         self.expires = expires
    760         self.discard = discard
    761         self.comment = comment
    762         self.comment_url = comment_url
    763         self.rfc2109 = rfc2109
    764 
    765         self._rest = copy.copy(rest)
    766 
    767     def has_nonstandard_attr(self, name):
    768         return name in self._rest
    769     def get_nonstandard_attr(self, name, default=None):
    770         return self._rest.get(name, default)
    771     def set_nonstandard_attr(self, name, value):
    772         self._rest[name] = value
    773 
    774     def is_expired(self, now=None):
    775         if now is None: now = time.time()
    776         if (self.expires is not None) and (self.expires <= now):
    777             return True
    778         return False
    779 
    780     def __str__(self):
    781         if self.port is None: p = ""
    782         else: p = ":"+self.port
    783         limit = self.domain + p + self.path
    784         if self.value is not None:
    785             namevalue = "%s=%s" % (self.name, self.value)
    786         else:
    787             namevalue = self.name
    788         return "<Cookie %s for %s>" % (namevalue, limit)
    789 
    790     def __repr__(self):
    791         args = []
    792         for name in ("version", "name", "value",
    793                      "port", "port_specified",
    794                      "domain", "domain_specified", "domain_initial_dot",
    795                      "path", "path_specified",
    796                      "secure", "expires", "discard", "comment", "comment_url",
    797                      ):
    798             attr = getattr(self, name)
    799             args.append("%s=%s" % (name, repr(attr)))
    800         args.append("rest=%s" % repr(self._rest))
    801         args.append("rfc2109=%s" % repr(self.rfc2109))
    802         return "Cookie(%s)" % ", ".join(args)
    803 
    804 
    805 class CookiePolicy:
    806     """Defines which cookies get accepted from and returned to server.
    807 
    808     May also modify cookies, though this is probably a bad idea.
    809 
    810     The subclass DefaultCookiePolicy defines the standard rules for Netscape
    811     and RFC 2965 cookies -- override that if you want a customised policy.
    812 
    813     """
    814     def set_ok(self, cookie, request):
    815         """Return true if (and only if) cookie should be accepted from server.
    816 
    817         Currently, pre-expired cookies never get this far -- the CookieJar
    818         class deletes such cookies itself.
    819 
    820         """
    821         raise NotImplementedError()
    822 
    823     def return_ok(self, cookie, request):
    824         """Return true if (and only if) cookie should be returned to server."""
    825         raise NotImplementedError()
    826 
    827     def domain_return_ok(self, domain, request):
    828         """Return false if cookies should not be returned, given cookie domain.
    829         """
    830         return True
    831 
    832     def path_return_ok(self, path, request):
    833         """Return false if cookies should not be returned, given cookie path.
    834         """
    835         return True
    836 
    837 
    838 class DefaultCookiePolicy(CookiePolicy):
    839     """Implements the standard rules for accepting and returning cookies."""
    840 
    841     DomainStrictNoDots = 1
    842     DomainStrictNonDomain = 2
    843     DomainRFC2965Match = 4
    844 
    845     DomainLiberal = 0
    846     DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
    847 
    848     def __init__(self,
    849                  blocked_domains=None, allowed_domains=None,
    850                  netscape=True, rfc2965=False,
    851                  rfc2109_as_netscape=None,
    852                  hide_cookie2=False,
    853                  strict_domain=False,
    854                  strict_rfc2965_unverifiable=True,
    855                  strict_ns_unverifiable=False,
    856                  strict_ns_domain=DomainLiberal,
    857                  strict_ns_set_initial_dollar=False,
    858                  strict_ns_set_path=False,
    859                  ):
    860         """Constructor arguments should be passed as keyword arguments only."""
    861         self.netscape = netscape
    862         self.rfc2965 = rfc2965
    863         self.rfc2109_as_netscape = rfc2109_as_netscape
    864         self.hide_cookie2 = hide_cookie2
    865         self.strict_domain = strict_domain
    866         self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
    867         self.strict_ns_unverifiable = strict_ns_unverifiable
    868         self.strict_ns_domain = strict_ns_domain
    869         self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
    870         self.strict_ns_set_path = strict_ns_set_path
    871 
    872         if blocked_domains is not None:
    873             self._blocked_domains = tuple(blocked_domains)
    874         else:
    875             self._blocked_domains = ()
    876 
    877         if allowed_domains is not None:
    878             allowed_domains = tuple(allowed_domains)
    879         self._allowed_domains = allowed_domains
    880 
    881     def blocked_domains(self):
    882         """Return the sequence of blocked domains (as a tuple)."""
    883         return self._blocked_domains
    884     def set_blocked_domains(self, blocked_domains):
    885         """Set the sequence of blocked domains."""
    886         self._blocked_domains = tuple(blocked_domains)
    887 
    888     def is_blocked(self, domain):
    889         for blocked_domain in self._blocked_domains:
    890             if user_domain_match(domain, blocked_domain):
    891                 return True
    892         return False
    893 
    894     def allowed_domains(self):
    895         """Return None, or the sequence of allowed domains (as a tuple)."""
    896         return self._allowed_domains
    897     def set_allowed_domains(self, allowed_domains):
    898         """Set the sequence of allowed domains, or None."""
    899         if allowed_domains is not None:
    900             allowed_domains = tuple(allowed_domains)
    901         self._allowed_domains = allowed_domains
    902 
    903     def is_not_allowed(self, domain):
    904         if self._allowed_domains is None:
    905             return False
    906         for allowed_domain in self._allowed_domains:
    907             if user_domain_match(domain, allowed_domain):
    908                 return False
    909         return True
    910 
    911     def set_ok(self, cookie, request):
    912         """
    913         If you override .set_ok(), be sure to call this method.  If it returns
    914         false, so should your subclass (assuming your subclass wants to be more
    915         strict about which cookies to accept).
    916 
    917         """
    918         _debug(" - checking cookie %s=%s", cookie.name, cookie.value)
    919 
    920         assert cookie.name is not None
    921 
    922         for n in "version", "verifiability", "name", "path", "domain", "port":
    923             fn_name = "set_ok_"+n
    924             fn = getattr(self, fn_name)
    925             if not fn(cookie, request):
    926                 return False
    927 
    928         return True
    929 
    930     def set_ok_version(self, cookie, request):
    931         if cookie.version is None:
    932             # Version is always set to 0 by parse_ns_headers if it's a Netscape
    933             # cookie, so this must be an invalid RFC 2965 cookie.
    934             _debug("   Set-Cookie2 without version attribute (%s=%s)",
    935                    cookie.name, cookie.value)
    936             return False
    937         if cookie.version > 0 and not self.rfc2965:
    938             _debug("   RFC 2965 cookies are switched off")
    939             return False
    940         elif cookie.version == 0 and not self.netscape:
    941             _debug("   Netscape cookies are switched off")
    942             return False
    943         return True
    944 
    945     def set_ok_verifiability(self, cookie, request):
    946         if request.is_unverifiable() and is_third_party(request):
    947             if cookie.version > 0 and self.strict_rfc2965_unverifiable:
    948                 _debug("   third-party RFC 2965 cookie during "
    949                              "unverifiable transaction")
    950                 return False
    951             elif cookie.version == 0 and self.strict_ns_unverifiable:
    952                 _debug("   third-party Netscape cookie during "
    953                              "unverifiable transaction")
    954                 return False
    955         return True
    956 
    957     def set_ok_name(self, cookie, request):
    958         # Try and stop servers setting V0 cookies designed to hack other
    959         # servers that know both V0 and V1 protocols.
    960         if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
    961             cookie.name.startswith("$")):
    962             _debug("   illegal name (starts with '$'): '%s'", cookie.name)
    963             return False
    964         return True
    965 
    966     def set_ok_path(self, cookie, request):
    967         if cookie.path_specified:
    968             req_path = request_path(request)
    969             if ((cookie.version > 0 or
    970                  (cookie.version == 0 and self.strict_ns_set_path)) and
    971                 not req_path.startswith(cookie.path)):
    972                 _debug("   path attribute %s is not a prefix of request "
    973                        "path %s", cookie.path, req_path)
    974                 return False
    975         return True
    976 
    977     def set_ok_domain(self, cookie, request):
    978         if self.is_blocked(cookie.domain):
    979             _debug("   domain %s is in user block-list", cookie.domain)
    980             return False
    981         if self.is_not_allowed(cookie.domain):
    982             _debug("   domain %s is not in user allow-list", cookie.domain)
    983             return False
    984         if cookie.domain_specified:
    985             req_host, erhn = eff_request_host(request)
    986             domain = cookie.domain
    987             if self.strict_domain and (domain.count(".") >= 2):
    988                 # XXX This should probably be compared with the Konqueror
    989                 # (kcookiejar.cpp) and Mozilla implementations, but it's a
    990                 # losing battle.
    991                 i = domain.rfind(".")
    992                 j = domain.rfind(".", 0, i)
    993                 if j == 0:  # domain like .foo.bar
    994                     tld = domain[i+1:]
    995                     sld = domain[j+1:i]
    996                     if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
    997                        "gov", "mil", "int", "aero", "biz", "cat", "coop",
    998                        "info", "jobs", "mobi", "museum", "name", "pro",
    999                        "travel", "eu") and len(tld) == 2:
   1000                         # domain like .co.uk
   1001                         _debug("   country-code second level domain %s", domain)
   1002                         return False
   1003             if domain.startswith("."):
   1004                 undotted_domain = domain[1:]
   1005             else:
   1006                 undotted_domain = domain
   1007             embedded_dots = (undotted_domain.find(".") >= 0)
   1008             if not embedded_dots and domain != ".local":
   1009                 _debug("   non-local domain %s contains no embedded dot",
   1010                        domain)
   1011                 return False
   1012             if cookie.version == 0:
   1013                 if (not erhn.endswith(domain) and
   1014                     (not erhn.startswith(".") and
   1015                      not ("."+erhn).endswith(domain))):
   1016                     _debug("   effective request-host %s (even with added "
   1017                            "initial dot) does not end with %s",
   1018                            erhn, domain)
   1019                     return False
   1020             if (cookie.version > 0 or
   1021                 (self.strict_ns_domain & self.DomainRFC2965Match)):
   1022                 if not domain_match(erhn, domain):
   1023                     _debug("   effective request-host %s does not domain-match "
   1024                            "%s", erhn, domain)
   1025                     return False
   1026             if (cookie.version > 0 or
   1027                 (self.strict_ns_domain & self.DomainStrictNoDots)):
   1028                 host_prefix = req_host[:-len(domain)]
   1029                 if (host_prefix.find(".") >= 0 and
   1030                     not IPV4_RE.search(req_host)):
   1031                     _debug("   host prefix %s for domain %s contains a dot",
   1032                            host_prefix, domain)
   1033                     return False
   1034         return True
   1035 
   1036     def set_ok_port(self, cookie, request):
   1037         if cookie.port_specified:
   1038             req_port = request_port(request)
   1039             if req_port is None:
   1040                 req_port = "80"
   1041             else:
   1042                 req_port = str(req_port)
   1043             for p in cookie.port.split(","):
   1044                 try:
   1045                     int(p)
   1046                 except ValueError:
   1047                     _debug("   bad port %s (not numeric)", p)
   1048                     return False
   1049                 if p == req_port:
   1050                     break
   1051             else:
   1052                 _debug("   request port (%s) not found in %s",
   1053                        req_port, cookie.port)
   1054                 return False
   1055         return True
   1056 
   1057     def return_ok(self, cookie, request):
   1058         """
   1059         If you override .return_ok(), be sure to call this method.  If it
   1060         returns false, so should your subclass (assuming your subclass wants to
   1061         be more strict about which cookies to return).
   1062 
   1063         """
   1064         # Path has already been checked by .path_return_ok(), and domain
   1065         # blocking done by .domain_return_ok().
   1066         _debug(" - checking cookie %s=%s", cookie.name, cookie.value)
   1067 
   1068         for n in "version", "verifiability", "secure", "expires", "port", "domain":
   1069             fn_name = "return_ok_"+n
   1070             fn = getattr(self, fn_name)
   1071             if not fn(cookie, request):
   1072                 return False
   1073         return True
   1074 
   1075     def return_ok_version(self, cookie, request):
   1076         if cookie.version > 0 and not self.rfc2965:
   1077             _debug("   RFC 2965 cookies are switched off")
   1078             return False
   1079         elif cookie.version == 0 and not self.netscape:
   1080             _debug("   Netscape cookies are switched off")
   1081             return False
   1082         return True
   1083 
   1084     def return_ok_verifiability(self, cookie, request):
   1085         if request.is_unverifiable() and is_third_party(request):
   1086             if cookie.version > 0 and self.strict_rfc2965_unverifiable:
   1087                 _debug("   third-party RFC 2965 cookie during unverifiable "
   1088                        "transaction")
   1089                 return False
   1090             elif cookie.version == 0 and self.strict_ns_unverifiable:
   1091                 _debug("   third-party Netscape cookie during unverifiable "
   1092                        "transaction")
   1093                 return False
   1094         return True
   1095 
   1096     def return_ok_secure(self, cookie, request):
   1097         if cookie.secure and request.get_type() != "https":
   1098             _debug("   secure cookie with non-secure request")
   1099             return False
   1100         return True
   1101 
   1102     def return_ok_expires(self, cookie, request):
   1103         if cookie.is_expired(self._now):
   1104             _debug("   cookie expired")
   1105             return False
   1106         return True
   1107 
   1108     def return_ok_port(self, cookie, request):
   1109         if cookie.port:
   1110             req_port = request_port(request)
   1111             if req_port is None:
   1112                 req_port = "80"
   1113             for p in cookie.port.split(","):
   1114                 if p == req_port:
   1115                     break
   1116             else:
   1117                 _debug("   request port %s does not match cookie port %s",
   1118                        req_port, cookie.port)
   1119                 return False
   1120         return True
   1121 
   1122     def return_ok_domain(self, cookie, request):
   1123         req_host, erhn = eff_request_host(request)
   1124         domain = cookie.domain
   1125 
   1126         # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
   1127         if (cookie.version == 0 and
   1128             (self.strict_ns_domain & self.DomainStrictNonDomain) and
   1129             not cookie.domain_specified and domain != erhn):
   1130             _debug("   cookie with unspecified domain does not string-compare "
   1131                    "equal to request domain")
   1132             return False
   1133 
   1134         if cookie.version > 0 and not domain_match(erhn, domain):
   1135             _debug("   effective request-host name %s does not domain-match "
   1136                    "RFC 2965 cookie domain %s", erhn, domain)
   1137             return False
   1138         if cookie.version == 0 and not ("."+erhn).endswith(domain):
   1139             _debug("   request-host %s does not match Netscape cookie domain "
   1140                    "%s", req_host, domain)
   1141             return False
   1142         return True
   1143 
   1144     def domain_return_ok(self, domain, request):
   1145         # Liberal check of.  This is here as an optimization to avoid
   1146         # having to load lots of MSIE cookie files unless necessary.
   1147         req_host, erhn = eff_request_host(request)
   1148         if not req_host.startswith("."):
   1149             req_host = "."+req_host
   1150         if not erhn.startswith("."):
   1151             erhn = "."+erhn
   1152         if not (req_host.endswith(domain) or erhn.endswith(domain)):
   1153             #_debug("   request domain %s does not match cookie domain %s",
   1154             #       req_host, domain)
   1155             return False
   1156 
   1157         if self.is_blocked(domain):
   1158             _debug("   domain %s is in user block-list", domain)
   1159             return False
   1160         if self.is_not_allowed(domain):
   1161             _debug("   domain %s is not in user allow-list", domain)
   1162             return False
   1163 
   1164         return True
   1165 
   1166     def path_return_ok(self, path, request):
   1167         _debug("- checking cookie path=%s", path)
   1168         req_path = request_path(request)
   1169         if not req_path.startswith(path):
   1170             _debug("  %s does not path-match %s", req_path, path)
   1171             return False
   1172         return True
   1173 
   1174 
   1175 def vals_sorted_by_key(adict):
   1176     keys = adict.keys()
   1177     keys.sort()
   1178     return map(adict.get, keys)
   1179 
   1180 def deepvalues(mapping):
   1181     """Iterates over nested mapping, depth-first, in sorted order by key."""
   1182     values = vals_sorted_by_key(mapping)
   1183     for obj in values:
   1184         mapping = False
   1185         try:
   1186             obj.items
   1187         except AttributeError:
   1188             pass
   1189         else:
   1190             mapping = True
   1191             for subobj in deepvalues(obj):
   1192                 yield subobj
   1193         if not mapping:
   1194             yield obj
   1195 
   1196 
   1197 # Used as second parameter to dict.get() method, to distinguish absent
   1198 # dict key from one with a None value.
   1199 class Absent: pass
   1200 
   1201 class CookieJar:
   1202     """Collection of HTTP cookies.
   1203 
   1204     You may not need to know about this class: try
   1205     urllib2.build_opener(HTTPCookieProcessor).open(url).
   1206 
   1207     """
   1208 
    # Matches any single non-word character (regex \W).
    non_word_re = re.compile(r"\W")
    # Captures a double quote or a backslash in group 1.
    quote_re = re.compile(r"([\"\\])")
    # An optional leading dot followed by a (possibly empty) run of
    # characters that are not dots.
    strict_domain_re = re.compile(r"\.?[^.]*")
    # A (possibly empty) run of characters that are not dots.
    domain_re = re.compile(r"[^.]*")
    # One or more dots at the very start of the string.
    dots_re = re.compile(r"^\.+")

    # Pattern text (a plain string, not compiled here) for a line starting
    # with "#LWP-Cookies-<digits>.<digits>"; group 1 captures the number.
    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
   1216 
   1217     def __init__(self, policy=None):
   1218         if policy is None:
   1219             policy = DefaultCookiePolicy()
   1220         self._policy = policy
   1221 
   1222         self._cookies_lock = _threading.RLock()
   1223         self._cookies = {}
   1224 
   1225     def set_policy(self, policy):
   1226         self._policy = policy
   1227 
   1228     def _cookies_for_domain(self, domain, request):
   1229         cookies = []
   1230         if not self._policy.domain_return_ok(domain, request):
   1231             return []
   1232         _debug("Checking %s for cookies to return", domain)
   1233         cookies_by_path = self._cookies[domain]
   1234         for path in cookies_by_path.keys():
   1235             if not self._policy.path_return_ok(path, request):
   1236                 continue
   1237             cookies_by_name = cookies_by_path[path]
   1238             for cookie in cookies_by_name.values():
   1239                 if not self._policy.return_ok(cookie, request):
   1240                     _debug("   not returning cookie")
   1241                     continue
   1242                 _debug("   it's a match")
   1243                 cookies.append(cookie)
   1244         return cookies
   1245 
   1246     def _cookies_for_request(self, request):
   1247         """Return a list of cookies to be returned to server."""
   1248         cookies = []
   1249         for domain in self._cookies.keys():
   1250             cookies.extend(self._cookies_for_domain(domain, request))
   1251         return cookies
   1252 
   1253     def _cookie_attrs(self, cookies):
   1254         """Return a list of cookie-attributes to be returned to server.
   1255 
   1256         like ['foo="bar"; $Path="/"', ...]
   1257 
   1258         The $Version attribute is also added when appropriate (currently only
   1259         once per request).
   1260 
   1261         """
   1262         # add cookies in order of most specific (ie. longest) path first
   1263         cookies.sort(key=lambda arg: len(arg.path), reverse=True)
   1264 
   1265         version_set = False
   1266 
   1267         attrs = []
   1268         for cookie in cookies:
   1269             # set version of Cookie header
   1270             # XXX
   1271             # What should it be if multiple matching Set-Cookie headers have
   1272             #  different versions themselves?
   1273             # Answer: there is no answer; was supposed to be settled by
   1274             #  RFC 2965 errata, but that may never appear...
   1275             version = cookie.version
   1276             if not version_set:
   1277                 version_set = True
   1278                 if version > 0:
   1279                     attrs.append("$Version=%s" % version)
   1280 
   1281             # quote cookie value if necessary
   1282             # (not for Netscape protocol, which already has any quotes
   1283             #  intact, due to the poorly-specified Netscape Cookie: syntax)
   1284             if ((cookie.value is not None) and
   1285                 self.non_word_re.search(cookie.value) and version > 0):
   1286                 value = self.quote_re.sub(r"\\\1", cookie.value)
   1287             else:
   1288                 value = cookie.value
   1289 
   1290             # add cookie-attributes to be returned in Cookie header
   1291             if cookie.value is None:
   1292                 attrs.append(cookie.name)
   1293             else:
   1294                 attrs.append("%s=%s" % (cookie.name, value))
   1295             if version > 0:
   1296                 if cookie.path_specified:
   1297                     attrs.append('$Path="%s"' % cookie.path)
   1298                 if cookie.domain.startswith("."):
   1299                     domain = cookie.domain
   1300                     if (not cookie.domain_initial_dot and
   1301                         domain.startswith(".")):
   1302                         domain = domain[1:]
   1303                     attrs.append('$Domain="%s"' % domain)
   1304                 if cookie.port is not None:
   1305                     p = "$Port"
   1306                     if cookie.port_specified:
   1307                         p = p + ('="%s"' % cookie.port)
   1308                     attrs.append(p)
   1309 
   1310         return attrs
   1311 
   1312     def add_cookie_header(self, request):
   1313         """Add correct Cookie: header to request (urllib2.Request object).
   1314 
   1315         The Cookie2 header is also added unless policy.hide_cookie2 is true.
   1316 
   1317         """
   1318         _debug("add_cookie_header")
   1319         self._cookies_lock.acquire()
   1320         try:
   1321 
   1322             self._policy._now = self._now = int(time.time())
   1323 
   1324             cookies = self._cookies_for_request(request)
   1325 
   1326             attrs = self._cookie_attrs(cookies)
   1327             if attrs:
   1328                 if not request.has_header("Cookie"):
   1329                     request.add_unredirected_header(
   1330                         "Cookie", "; ".join(attrs))
   1331 
   1332             # if necessary, advertise that we know RFC 2965
   1333             if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
   1334                 not request.has_header("Cookie2")):
   1335                 for cookie in cookies:
   1336                     if cookie.version != 1:
   1337                         request.add_unredirected_header("Cookie2", '$Version="1"')
   1338                         break
   1339 
   1340         finally:
   1341             self._cookies_lock.release()
   1342 
   1343         self.clear_expired_cookies()
   1344 
   1345     def _normalized_cookie_tuples(self, attrs_set):
   1346         """Return list of tuples containing normalised cookie information.
   1347 
   1348         attrs_set is the list of lists of key,value pairs extracted from
   1349         the Set-Cookie or Set-Cookie2 headers.
   1350 
   1351         Tuples are name, value, standard, rest, where name and value are the
   1352         cookie name and value, standard is a dictionary containing the standard
   1353         cookie-attributes (discard, secure, version, expires or max-age,
   1354         domain, path and port) and rest is a dictionary containing the rest of
   1355         the cookie-attributes.
   1356 
   1357         """
   1358         cookie_tuples = []
   1359 
   1360         boolean_attrs = "discard", "secure"
   1361         value_attrs = ("version",
   1362                        "expires", "max-age",
   1363                        "domain", "path", "port",
   1364                        "comment", "commenturl")
   1365 
   1366         for cookie_attrs in attrs_set:
   1367             name, value = cookie_attrs[0]
   1368 
   1369             # Build dictionary of standard cookie-attributes (standard) and
   1370             # dictionary of other cookie-attributes (rest).
   1371 
   1372             # Note: expiry time is normalised to seconds since epoch.  V0
   1373             # cookies should have the Expires cookie-attribute, and V1 cookies
   1374             # should have Max-Age, but since V1 includes RFC 2109 cookies (and
   1375             # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
   1376             # accept either (but prefer Max-Age).
   1377             max_age_set = False
   1378 
   1379             bad_cookie = False
   1380 
   1381             standard = {}
   1382             rest = {}
   1383             for k, v in cookie_attrs[1:]:
   1384                 lc = k.lower()
   1385                 # don't lose case distinction for unknown fields
   1386                 if lc in value_attrs or lc in boolean_attrs:
   1387                     k = lc
   1388                 if k in boolean_attrs and v is None:
   1389                     # boolean cookie-attribute is present, but has no value
   1390                     # (like "discard", rather than "port=80")
   1391                     v = True
   1392                 if k in standard:
   1393                     # only first value is significant
   1394                     continue
   1395                 if k == "domain":
   1396                     if v is None:
   1397                         _debug("   missing value for domain attribute")
   1398                         bad_cookie = True
   1399                         break
   1400                     # RFC 2965 section 3.3.3
   1401                     v = v.lower()
   1402                 if k == "expires":
   1403                     if max_age_set:
   1404                         # Prefer max-age to expires (like Mozilla)
   1405                         continue
   1406                     if v is None:
   1407                         _debug("   missing or invalid value for expires "
   1408                               "attribute: treating as session cookie")
   1409                         continue
   1410                 if k == "max-age":
   1411                     max_age_set = True
   1412                     try:
   1413                         v = int(v)
   1414                     except ValueError:
   1415                         _debug("   missing or invalid (non-numeric) value for "
   1416                               "max-age attribute")
   1417                         bad_cookie = True
   1418                         break
   1419                     # convert RFC 2965 Max-Age to seconds since epoch
   1420                     # XXX Strictly you're supposed to follow RFC 2616
   1421                     #   age-calculation rules.  Remember that zero Max-Age is a
   1422                     #   is a request to discard (old and new) cookie, though.
   1423                     k = "expires"
   1424                     v = self._now + v
   1425                 if (k in value_attrs) or (k in boolean_attrs):
   1426                     if (v is None and
   1427                         k not in ("port", "comment", "commenturl")):
   1428                         _debug("   missing value for %s attribute" % k)
   1429                         bad_cookie = True
   1430                         break
   1431                     standard[k] = v
   1432                 else:
   1433                     rest[k] = v
   1434 
   1435             if bad_cookie:
   1436                 continue
   1437 
   1438             cookie_tuples.append((name, value, standard, rest))
   1439 
   1440         return cookie_tuples
   1441 
   1442     def _cookie_from_cookie_tuple(self, tup, request):
   1443         # standard is dict of standard cookie-attributes, rest is dict of the
   1444         # rest of them
   1445         name, value, standard, rest = tup
   1446 
   1447         domain = standard.get("domain", Absent)
   1448         path = standard.get("path", Absent)
   1449         port = standard.get("port", Absent)
   1450         expires = standard.get("expires", Absent)
   1451 
   1452         # set the easy defaults
   1453         version = standard.get("version", None)
   1454         if version is not None:
   1455             try:
   1456                 version = int(version)
   1457             except ValueError:
   1458                 return None  # invalid version, ignore cookie
   1459         secure = standard.get("secure", False)
   1460         # (discard is also set if expires is Absent)
   1461         discard = standard.get("discard", False)
   1462         comment = standard.get("comment", None)
   1463         comment_url = standard.get("commenturl", None)
   1464 
   1465         # set default path
   1466         if path is not Absent and path != "":
   1467             path_specified = True
   1468             path = escape_path(path)
   1469         else:
   1470             path_specified = False
   1471             path = request_path(request)
   1472             i = path.rfind("/")
   1473             if i != -1:
   1474                 if version == 0:
   1475                     # Netscape spec parts company from reality here
   1476                     path = path[:i]
   1477                 else:
   1478                     path = path[:i+1]
   1479             if len(path) == 0: path = "/"
   1480 
   1481         # set default domain
   1482         domain_specified = domain is not Absent
   1483         # but first we have to remember whether it starts with a dot
   1484         domain_initial_dot = False
   1485         if domain_specified:
   1486             domain_initial_dot = bool(domain.startswith("."))
   1487         if domain is Absent:
   1488             req_host, erhn = eff_request_host(request)
   1489             domain = erhn
   1490         elif not domain.startswith("."):
   1491             domain = "."+domain
   1492 
   1493         # set default port
   1494         port_specified = False
   1495         if port is not Absent:
   1496             if port is None:
   1497                 # Port attr present, but has no value: default to request port.
   1498                 # Cookie should then only be sent back on that port.
   1499                 port = request_port(request)
   1500             else:
   1501                 port_specified = True
   1502                 port = re.sub(r"\s+", "", port)
   1503         else:
   1504             # No port attr present.  Cookie can be sent back on any port.
   1505             port = None
   1506 
   1507         # set default expires and discard
   1508         if expires is Absent:
   1509             expires = None
   1510             discard = True
   1511         elif expires <= self._now:
   1512             # Expiry date in past is request to delete cookie.  This can't be
   1513             # in DefaultCookiePolicy, because can't delete cookies there.
   1514             try:
   1515                 self.clear(domain, path, name)
   1516             except KeyError:
   1517                 pass
   1518             _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
   1519                    domain, path, name)
   1520             return None
   1521 
   1522         return Cookie(version,
   1523                       name, value,
   1524                       port, port_specified,
   1525                       domain, domain_specified, domain_initial_dot,
   1526                       path, path_specified,
   1527                       secure,
   1528                       expires,
   1529                       discard,
   1530                       comment,
   1531                       comment_url,
   1532                       rest)
   1533 
   1534     def _cookies_from_attrs_set(self, attrs_set, request):
   1535         cookie_tuples = self._normalized_cookie_tuples(attrs_set)
   1536 
   1537         cookies = []
   1538         for tup in cookie_tuples:
   1539             cookie = self._cookie_from_cookie_tuple(tup, request)
   1540             if cookie: cookies.append(cookie)
   1541         return cookies
   1542 
   1543     def _process_rfc2109_cookies(self, cookies):
   1544         rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
   1545         if rfc2109_as_ns is None:
   1546             rfc2109_as_ns = not self._policy.rfc2965
   1547         for cookie in cookies:
   1548             if cookie.version == 1:
   1549                 cookie.rfc2109 = True
   1550                 if rfc2109_as_ns:
   1551                     # treat 2109 cookies as Netscape cookies rather than
   1552                     # as RFC2965 cookies
   1553                     cookie.version = 0
   1554 
   1555     def make_cookies(self, response, request):
   1556         """Return sequence of Cookie objects extracted from response object."""
   1557         # get cookie-attributes for RFC 2965 and Netscape protocols
   1558         headers = response.info()
   1559         rfc2965_hdrs = headers.getheaders("Set-Cookie2")
   1560         ns_hdrs = headers.getheaders("Set-Cookie")
   1561 
   1562         rfc2965 = self._policy.rfc2965
   1563         netscape = self._policy.netscape
   1564 
   1565         if ((not rfc2965_hdrs and not ns_hdrs) or
   1566             (not ns_hdrs and not rfc2965) or
   1567             (not rfc2965_hdrs and not netscape) or
   1568             (not netscape and not rfc2965)):
   1569             return []  # no relevant cookie headers: quick exit
   1570 
   1571         try:
   1572             cookies = self._cookies_from_attrs_set(
   1573                 split_header_words(rfc2965_hdrs), request)
   1574         except Exception:
   1575             _warn_unhandled_exception()
   1576             cookies = []
   1577 
   1578         if ns_hdrs and netscape:
   1579             try:
   1580                 # RFC 2109 and Netscape cookies
   1581                 ns_cookies = self._cookies_from_attrs_set(
   1582                     parse_ns_headers(ns_hdrs), request)
   1583             except Exception:
   1584                 _warn_unhandled_exception()
   1585                 ns_cookies = []
   1586             self._process_rfc2109_cookies(ns_cookies)
   1587 
   1588             # Look for Netscape cookies (from Set-Cookie headers) that match
   1589             # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
   1590             # For each match, keep the RFC 2965 cookie and ignore the Netscape
   1591             # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
   1592             # bundled in with the Netscape cookies for this purpose, which is
   1593             # reasonable behaviour.
   1594             if rfc2965:
   1595                 lookup = {}
   1596                 for cookie in cookies:
   1597                     lookup[(cookie.domain, cookie.path, cookie.name)] = None
   1598 
   1599                 def no_matching_rfc2965(ns_cookie, lookup=lookup):
   1600                     key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
   1601                     return key not in lookup
   1602                 ns_cookies = filter(no_matching_rfc2965, ns_cookies)
   1603 
   1604             if ns_cookies:
   1605                 cookies.extend(ns_cookies)
   1606 
   1607         return cookies
   1608 
   1609     def set_cookie_if_ok(self, cookie, request):
   1610         """Set a cookie if policy says it's OK to do so."""
   1611         self._cookies_lock.acquire()
   1612         try:
   1613             self._policy._now = self._now = int(time.time())
   1614 
   1615             if self._policy.set_ok(cookie, request):
   1616                 self.set_cookie(cookie)
   1617 
   1618 
   1619         finally:
   1620             self._cookies_lock.release()
   1621 
   1622     def set_cookie(self, cookie):
   1623         """Set a cookie, without checking whether or not it should be set."""
   1624         c = self._cookies
   1625         self._cookies_lock.acquire()
   1626         try:
   1627             if cookie.domain not in c: c[cookie.domain] = {}
   1628             c2 = c[cookie.domain]
   1629             if cookie.path not in c2: c2[cookie.path] = {}
   1630             c3 = c2[cookie.path]
   1631             c3[cookie.name] = cookie
   1632         finally:
   1633             self._cookies_lock.release()
   1634 
   1635     def extract_cookies(self, response, request):
   1636         """Extract cookies from response, where allowable given the request."""
   1637         _debug("extract_cookies: %s", response.info())
   1638         self._cookies_lock.acquire()
   1639         try:
   1640             self._policy._now = self._now = int(time.time())
   1641 
   1642             for cookie in self.make_cookies(response, request):
   1643                 if self._policy.set_ok(cookie, request):
   1644                     _debug(" setting cookie: %s", cookie)
   1645                     self.set_cookie(cookie)
   1646         finally:
   1647             self._cookies_lock.release()
   1648 
   1649     def clear(self, domain=None, path=None, name=None):
   1650         """Clear some cookies.
   1651 
   1652         Invoking this method without arguments will clear all cookies.  If
   1653         given a single argument, only cookies belonging to that domain will be
   1654         removed.  If given two arguments, cookies belonging to the specified
   1655         path within that domain are removed.  If given three arguments, then
   1656         the cookie with the specified name, path and domain is removed.
   1657 
   1658         Raises KeyError if no matching cookie exists.
   1659 
   1660         """
   1661         if name is not None:
   1662             if (domain is None) or (path is None):
   1663                 raise ValueError(
   1664                     "domain and path must be given to remove a cookie by name")
   1665             del self._cookies[domain][path][name]
   1666         elif path is not None:
   1667             if domain is None:
   1668                 raise ValueError(
   1669                     "domain must be given to remove cookies by path")
   1670             del self._cookies[domain][path]
   1671         elif domain is not None:
   1672             del self._cookies[domain]
   1673         else:
   1674             self._cookies = {}
   1675 
   1676     def clear_session_cookies(self):
   1677         """Discard all session cookies.
   1678 
   1679         Note that the .save() method won't save session cookies anyway, unless
   1680         you ask otherwise by passing a true ignore_discard argument.
   1681 
   1682         """
   1683         self._cookies_lock.acquire()
   1684         try:
   1685             for cookie in self:
   1686                 if cookie.discard:
   1687                     self.clear(cookie.domain, cookie.path, cookie.name)
   1688         finally:
   1689             self._cookies_lock.release()
   1690 
   1691     def clear_expired_cookies(self):
   1692         """Discard all expired cookies.
   1693 
   1694         You probably don't need to call this method: expired cookies are never
   1695         sent back to the server (provided you're using DefaultCookiePolicy),
   1696         this method is called by CookieJar itself every so often, and the
   1697         .save() method won't save expired cookies anyway (unless you ask
   1698         otherwise by passing a true ignore_expires argument).
   1699 
   1700         """
   1701         self._cookies_lock.acquire()
   1702         try:
   1703             now = time.time()
   1704             for cookie in self:
   1705                 if cookie.is_expired(now):
   1706                     self.clear(cookie.domain, cookie.path, cookie.name)
   1707         finally:
   1708             self._cookies_lock.release()
   1709 
    def __iter__(self):
        """Return an iterator over the contained cookies.

        The iterator is whatever deepvalues() produces for self._cookies,
        the nested domain -> path -> name mapping filled in by set_cookie().

        """
        return deepvalues(self._cookies)
   1712 
   1713     def __len__(self):
   1714         """Return number of contained cookies."""
   1715         i = 0
   1716         for cookie in self: i = i + 1
   1717         return i
   1718 
   1719     def __repr__(self):
   1720         r = []
   1721         for cookie in self: r.append(repr(cookie))
   1722         return "<%s[%s]>" % (self.__class__, ", ".join(r))
   1723 
   1724     def __str__(self):
   1725         r = []
   1726         for cookie in self: r.append(str(cookie))
   1727         return "<%s[%s]>" % (self.__class__, ", ".join(r))
   1728 
   1729 
   1730 # derives from IOError for backwards-compatibility with Python 2.4.0
   1731 class LoadError(IOError): pass
   1732 
   1733 class FileCookieJar(CookieJar):
   1734     """CookieJar that can be loaded from and saved to a file."""
   1735 
   1736     def __init__(self, filename=None, delayload=False, policy=None):
   1737         """
   1738         Cookies are NOT loaded from the named file until either the .load() or
   1739         .revert() method is called.
   1740 
   1741         """
   1742         CookieJar.__init__(self, policy)
   1743         if filename is not None:
   1744             try:
   1745                 filename+""
   1746             except:
   1747                 raise ValueError("filename must be string-like")
   1748         self.filename = filename
   1749         self.delayload = bool(delayload)
   1750 
   1751     def save(self, filename=None, ignore_discard=False, ignore_expires=False):
   1752         """Save cookies to a file."""
   1753         raise NotImplementedError()
   1754 
   1755     def load(self, filename=None, ignore_discard=False, ignore_expires=False):
   1756         """Load cookies from a file."""
   1757         if filename is None:
   1758             if self.filename is not None: filename = self.filename
   1759             else: raise ValueError(MISSING_FILENAME_TEXT)
   1760 
   1761         f = open(filename)
   1762         try:
   1763             self._really_load(f, filename, ignore_discard, ignore_expires)
   1764         finally:
   1765             f.close()
   1766 
   1767     def revert(self, filename=None,
   1768                ignore_discard=False, ignore_expires=False):
   1769         """Clear all cookies and reload cookies from a saved file.
   1770 
   1771         Raises LoadError (or IOError) if reversion is not successful; the
   1772         object's state will not be altered if this happens.
   1773 
   1774         """
   1775         if filename is None:
   1776             if self.filename is not None: filename = self.filename
   1777             else: raise ValueError(MISSING_FILENAME_TEXT)
   1778 
   1779         self._cookies_lock.acquire()
   1780         try:
   1781 
   1782             old_state = copy.deepcopy(self._cookies)
   1783             self._cookies = {}
   1784             try:
   1785                 self.load(filename, ignore_discard, ignore_expires)
   1786             except (LoadError, IOError):
   1787                 self._cookies = old_state
   1788                 raise
   1789 
   1790         finally:
   1791             self._cookies_lock.release()
   1792 
   1793 from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
   1794 from _MozillaCookieJar import MozillaCookieJar
   1795