Home | History | Annotate | Download | only in json
      1 """Implementation of JSONDecoder
      2 """
      3 import re
      4 import sys
      5 import struct
      6 
      7 from json import scanner
      8 try:
      9     from _json import scanstring as c_scanstring
     10 except ImportError:
     11     c_scanstring = None
     12 
     13 __all__ = ['JSONDecoder']
     14 
     15 FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
     16 
     17 def _floatconstants():
     18     nan, = struct.unpack('>d', b'\x7f\xf8\x00\x00\x00\x00\x00\x00')
     19     inf, = struct.unpack('>d', b'\x7f\xf0\x00\x00\x00\x00\x00\x00')
     20     return nan, inf, -inf
     21 
     22 NaN, PosInf, NegInf = _floatconstants()
     23 
     24 
     25 def linecol(doc, pos):
     26     lineno = doc.count('\n', 0, pos) + 1
     27     if lineno == 1:
     28         colno = pos + 1
     29     else:
     30         colno = pos - doc.rindex('\n', 0, pos)
     31     return lineno, colno
     32 
     33 
     34 def errmsg(msg, doc, pos, end=None):
     35     # Note that this function is called from _json
     36     lineno, colno = linecol(doc, pos)
     37     if end is None:
     38         fmt = '{0}: line {1} column {2} (char {3})'
     39         return fmt.format(msg, lineno, colno, pos)
     40         #fmt = '%s: line %d column %d (char %d)'
     41         #return fmt % (msg, lineno, colno, pos)
     42     endlineno, endcolno = linecol(doc, end)
     43     fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'
     44     return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end)
     45     #fmt = '%s: line %d column %d - line %d column %d (char %d - %d)'
     46     #return fmt % (msg, lineno, colno, endlineno, endcolno, pos, end)
     47 
     48 
     49 _CONSTANTS = {
     50     '-Infinity': NegInf,
     51     'Infinity': PosInf,
     52     'NaN': NaN,
     53 }
     54 
     55 STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
     56 BACKSLASH = {
     57     '"': u'"', '\\': u'\\', '/': u'/',
     58     'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
     59 }
     60 
     61 DEFAULT_ENCODING = "utf-8"
     62 
     63 def _decode_uXXXX(s, pos):
     64     esc = s[pos + 1:pos + 5]
     65     if len(esc) == 4 and esc[1] not in 'xX':
     66         try:
     67             return int(esc, 16)
     68         except ValueError:
     69             pass
     70     msg = "Invalid \\uXXXX escape"
     71     raise ValueError(errmsg(msg, s, pos))
     72 
     73 def py_scanstring(s, end, encoding=None, strict=True,
     74         _b=BACKSLASH, _m=STRINGCHUNK.match):
     75     """Scan the string s for a JSON string. End is the index of the
     76     character in s after the quote that started the JSON string.
     77     Unescapes all valid JSON string escape sequences and raises ValueError
     78     on attempt to decode an invalid string. If strict is False then literal
     79     control characters are allowed in the string.
     80 
     81     Returns a tuple of the decoded string and the index of the character in s
     82     after the end quote."""
     83     if encoding is None:
     84         encoding = DEFAULT_ENCODING
     85     chunks = []
     86     _append = chunks.append
     87     begin = end - 1
     88     while 1:
     89         chunk = _m(s, end)
     90         if chunk is None:
     91             raise ValueError(
     92                 errmsg("Unterminated string starting at", s, begin))
     93         end = chunk.end()
     94         content, terminator = chunk.groups()
     95         # Content is contains zero or more unescaped string characters
     96         if content:
     97             if not isinstance(content, unicode):
     98                 content = unicode(content, encoding)
     99             _append(content)
    100         # Terminator is the end of string, a literal control character,
    101         # or a backslash denoting that an escape sequence follows
    102         if terminator == '"':
    103             break
    104         elif terminator != '\\':
    105             if strict:
    106                 #msg = "Invalid control character %r at" % (terminator,)
    107                 msg = "Invalid control character {0!r} at".format(terminator)
    108                 raise ValueError(errmsg(msg, s, end))
    109             else:
    110                 _append(terminator)
    111                 continue
    112         try:
    113             esc = s[end]
    114         except IndexError:
    115             raise ValueError(
    116                 errmsg("Unterminated string starting at", s, begin))
    117         # If not a unicode escape sequence, must be in the lookup table
    118         if esc != 'u':
    119             try:
    120                 char = _b[esc]
    121             except KeyError:
    122                 msg = "Invalid \\escape: " + repr(esc)
    123                 raise ValueError(errmsg(msg, s, end))
    124             end += 1
    125         else:
    126             # Unicode escape sequence
    127             uni = _decode_uXXXX(s, end)
    128             end += 5
    129             # Check for surrogate pair on UCS-4 systems
    130             if sys.maxunicode > 65535 and \
    131                0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u':
    132                 uni2 = _decode_uXXXX(s, end + 1)
    133                 if 0xdc00 <= uni2 <= 0xdfff:
    134                     uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
    135                     end += 6
    136             char = unichr(uni)
    137         # Append the unescaped character
    138         _append(char)
    139     return u''.join(chunks), end
    140 
    141 
    142 # Use speedup if available
    143 scanstring = c_scanstring or py_scanstring
    144 
    145 WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
    146 WHITESPACE_STR = ' \t\n\r'
    147 
    148 def JSONObject(s_and_end, encoding, strict, scan_once, object_hook,
    149                object_pairs_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    150     s, end = s_and_end
    151     pairs = []
    152     pairs_append = pairs.append
    153     # Use a slice to prevent IndexError from being raised, the following
    154     # check will raise a more specific ValueError if the string is empty
    155     nextchar = s[end:end + 1]
    156     # Normally we expect nextchar == '"'
    157     if nextchar != '"':
    158         if nextchar in _ws:
    159             end = _w(s, end).end()
    160             nextchar = s[end:end + 1]
    161         # Trivial empty object
    162         if nextchar == '}':
    163             if object_pairs_hook is not None:
    164                 result = object_pairs_hook(pairs)
    165                 return result, end + 1
    166             pairs = {}
    167             if object_hook is not None:
    168                 pairs = object_hook(pairs)
    169             return pairs, end + 1
    170         elif nextchar != '"':
    171             raise ValueError(errmsg(
    172                 "Expecting property name enclosed in double quotes", s, end))
    173     end += 1
    174     while True:
    175         key, end = scanstring(s, end, encoding, strict)
    176 
    177         # To skip some function call overhead we optimize the fast paths where
    178         # the JSON key separator is ": " or just ":".
    179         if s[end:end + 1] != ':':
    180             end = _w(s, end).end()
    181             if s[end:end + 1] != ':':
    182                 raise ValueError(errmsg("Expecting ':' delimiter", s, end))
    183         end += 1
    184 
    185         try:
    186             if s[end] in _ws:
    187                 end += 1
    188                 if s[end] in _ws:
    189                     end = _w(s, end + 1).end()
    190         except IndexError:
    191             pass
    192 
    193         try:
    194             value, end = scan_once(s, end)
    195         except StopIteration:
    196             raise ValueError(errmsg("Expecting object", s, end))
    197         pairs_append((key, value))
    198 
    199         try:
    200             nextchar = s[end]
    201             if nextchar in _ws:
    202                 end = _w(s, end + 1).end()
    203                 nextchar = s[end]
    204         except IndexError:
    205             nextchar = ''
    206         end += 1
    207 
    208         if nextchar == '}':
    209             break
    210         elif nextchar != ',':
    211             raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))
    212 
    213         try:
    214             nextchar = s[end]
    215             if nextchar in _ws:
    216                 end += 1
    217                 nextchar = s[end]
    218                 if nextchar in _ws:
    219                     end = _w(s, end + 1).end()
    220                     nextchar = s[end]
    221         except IndexError:
    222             nextchar = ''
    223 
    224         end += 1
    225         if nextchar != '"':
    226             raise ValueError(errmsg(
    227                 "Expecting property name enclosed in double quotes", s, end - 1))
    228     if object_pairs_hook is not None:
    229         result = object_pairs_hook(pairs)
    230         return result, end
    231     pairs = dict(pairs)
    232     if object_hook is not None:
    233         pairs = object_hook(pairs)
    234     return pairs, end
    235 
    236 def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    237     s, end = s_and_end
    238     values = []
    239     nextchar = s[end:end + 1]
    240     if nextchar in _ws:
    241         end = _w(s, end + 1).end()
    242         nextchar = s[end:end + 1]
    243     # Look-ahead for trivial empty array
    244     if nextchar == ']':
    245         return values, end + 1
    246     _append = values.append
    247     while True:
    248         try:
    249             value, end = scan_once(s, end)
    250         except StopIteration:
    251             raise ValueError(errmsg("Expecting object", s, end))
    252         _append(value)
    253         nextchar = s[end:end + 1]
    254         if nextchar in _ws:
    255             end = _w(s, end + 1).end()
    256             nextchar = s[end:end + 1]
    257         end += 1
    258         if nextchar == ']':
    259             break
    260         elif nextchar != ',':
    261             raise ValueError(errmsg("Expecting ',' delimiter", s, end))
    262         try:
    263             if s[end] in _ws:
    264                 end += 1
    265                 if s[end] in _ws:
    266                     end = _w(s, end + 1).end()
    267         except IndexError:
    268             pass
    269 
    270     return values, end
    271 
    272 class JSONDecoder(object):
    273     """Simple JSON <http://json.org> decoder
    274 
    275     Performs the following translations in decoding by default:
    276 
    277     +---------------+-------------------+
    278     | JSON          | Python            |
    279     +===============+===================+
    280     | object        | dict              |
    281     +---------------+-------------------+
    282     | array         | list              |
    283     +---------------+-------------------+
    284     | string        | unicode           |
    285     +---------------+-------------------+
    286     | number (int)  | int, long         |
    287     +---------------+-------------------+
    288     | number (real) | float             |
    289     +---------------+-------------------+
    290     | true          | True              |
    291     +---------------+-------------------+
    292     | false         | False             |
    293     +---------------+-------------------+
    294     | null          | None              |
    295     +---------------+-------------------+
    296 
    297     It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as
    298     their corresponding ``float`` values, which is outside the JSON spec.
    299 
    300     """
    301 
    302     def __init__(self, encoding=None, object_hook=None, parse_float=None,
    303             parse_int=None, parse_constant=None, strict=True,
    304             object_pairs_hook=None):
    305         """``encoding`` determines the encoding used to interpret any ``str``
    306         objects decoded by this instance (utf-8 by default).  It has no
    307         effect when decoding ``unicode`` objects.
    308 
    309         Note that currently only encodings that are a superset of ASCII work,
    310         strings of other encodings should be passed in as ``unicode``.
    311 
    312         ``object_hook``, if specified, will be called with the result
    313         of every JSON object decoded and its return value will be used in
    314         place of the given ``dict``.  This can be used to provide custom
    315         deserializations (e.g. to support JSON-RPC class hinting).
    316 
    317         ``object_pairs_hook``, if specified will be called with the result of
    318         every JSON object decoded with an ordered list of pairs.  The return
    319         value of ``object_pairs_hook`` will be used instead of the ``dict``.
    320         This feature can be used to implement custom decoders that rely on the
    321         order that the key and value pairs are decoded (for example,
    322         collections.OrderedDict will remember the order of insertion). If
    323         ``object_hook`` is also defined, the ``object_pairs_hook`` takes
    324         priority.
    325 
    326         ``parse_float``, if specified, will be called with the string
    327         of every JSON float to be decoded. By default this is equivalent to
    328         float(num_str). This can be used to use another datatype or parser
    329         for JSON floats (e.g. decimal.Decimal).
    330 
    331         ``parse_int``, if specified, will be called with the string
    332         of every JSON int to be decoded. By default this is equivalent to
    333         int(num_str). This can be used to use another datatype or parser
    334         for JSON integers (e.g. float).
    335 
    336         ``parse_constant``, if specified, will be called with one of the
    337         following strings: -Infinity, Infinity, NaN.
    338         This can be used to raise an exception if invalid JSON numbers
    339         are encountered.
    340 
    341         If ``strict`` is false (true is the default), then control
    342         characters will be allowed inside strings.  Control characters in
    343         this context are those with character codes in the 0-31 range,
    344         including ``'\\t'`` (tab), ``'\\n'``, ``'\\r'`` and ``'\\0'``.
    345 
    346         """
    347         self.encoding = encoding
    348         self.object_hook = object_hook
    349         self.object_pairs_hook = object_pairs_hook
    350         self.parse_float = parse_float or float
    351         self.parse_int = parse_int or int
    352         self.parse_constant = parse_constant or _CONSTANTS.__getitem__
    353         self.strict = strict
    354         self.parse_object = JSONObject
    355         self.parse_array = JSONArray
    356         self.parse_string = scanstring
    357         self.scan_once = scanner.make_scanner(self)
    358 
    359     def decode(self, s, _w=WHITESPACE.match):
    360         """Return the Python representation of ``s`` (a ``str`` or ``unicode``
    361         instance containing a JSON document)
    362 
    363         """
    364         obj, end = self.raw_decode(s, idx=_w(s, 0).end())
    365         end = _w(s, end).end()
    366         if end != len(s):
    367             raise ValueError(errmsg("Extra data", s, end, len(s)))
    368         return obj
    369 
    370     def raw_decode(self, s, idx=0):
    371         """Decode a JSON document from ``s`` (a ``str`` or ``unicode``
    372         beginning with a JSON document) and return a 2-tuple of the Python
    373         representation and the index in ``s`` where the document ended.
    374 
    375         This can be used to decode a JSON document from a string that may
    376         have extraneous data at the end.
    377 
    378         """
    379         try:
    380             obj, end = self.scan_once(s, idx)
    381         except StopIteration:
    382             raise ValueError("No JSON object could be decoded")
    383         return obj, end
    384