Home | History | Annotate | Download | only in json
      1 """Implementation of JSONDecoder
      2 """
      3 import re
      4 import sys
      5 import struct
      6 
      7 from json import scanner
      8 try:
      9     from _json import scanstring as c_scanstring
     10 except ImportError:
     11     c_scanstring = None
     12 
# Names exported by a star-import of this module.
__all__ = ['JSONDecoder']

# Regex flags used when compiling the STRINGCHUNK and WHITESPACE patterns
# defined further down in this module.
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
     16 
     17 def _floatconstants():
     18     _BYTES = '7FF80000000000007FF0000000000000'.decode('hex')
     19     if sys.byteorder != 'big':
     20         _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
     21     nan, inf = struct.unpack('dd', _BYTES)
     22     return nan, inf, -inf
     23 
# Float values that the _CONSTANTS table below maps the literals
# 'NaN', 'Infinity' and '-Infinity' to.
NaN, PosInf, NegInf = _floatconstants()
     25 
     26 
     27 def linecol(doc, pos):
     28     lineno = doc.count('\n', 0, pos) + 1
     29     if lineno == 1:
     30         colno = pos + 1
     31     else:
     32         colno = pos - doc.rindex('\n', 0, pos)
     33     return lineno, colno
     34 
     35 
     36 def errmsg(msg, doc, pos, end=None):
     37     # Note that this function is called from _json
     38     lineno, colno = linecol(doc, pos)
     39     if end is None:
     40         fmt = '{0}: line {1} column {2} (char {3})'
     41         return fmt.format(msg, lineno, colno, pos)
     42         #fmt = '%s: line %d column %d (char %d)'
     43         #return fmt % (msg, lineno, colno, pos)
     44     endlineno, endcolno = linecol(doc, end)
     45     fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'
     46     return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end)
     47     #fmt = '%s: line %d column %d - line %d column %d (char %d - %d)'
     48     #return fmt % (msg, lineno, colno, endlineno, endcolno, pos, end)
     49 
     50 
# Maps the non-standard constant literals to their float values; its
# __getitem__ is the default ``parse_constant`` of JSONDecoder.
_CONSTANTS = {
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
}

# Matches a (possibly empty, lazily matched) run of characters followed by
# one terminator: a double quote, a backslash, or a control character in
# the range \x00-\x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
# Maps the character following a backslash to the character it stands for.
# The \uXXXX escape is not in this table; py_scanstring handles it itself.
BACKSLASH = {
    '"': u'"', '\\': u'\\', '/': u'/',
    'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
}

# Encoding py_scanstring falls back to when it is called with encoding=None.
DEFAULT_ENCODING = "utf-8"
     64 
     65 def py_scanstring(s, end, encoding=None, strict=True,
     66         _b=BACKSLASH, _m=STRINGCHUNK.match):
     67     """Scan the string s for a JSON string. End is the index of the
     68     character in s after the quote that started the JSON string.
     69     Unescapes all valid JSON string escape sequences and raises ValueError
     70     on attempt to decode an invalid string. If strict is False then literal
     71     control characters are allowed in the string.
     72 
     73     Returns a tuple of the decoded string and the index of the character in s
     74     after the end quote."""
     75     if encoding is None:
     76         encoding = DEFAULT_ENCODING
     77     chunks = []
     78     _append = chunks.append
     79     begin = end - 1
     80     while 1:
     81         chunk = _m(s, end)
     82         if chunk is None:
     83             raise ValueError(
     84                 errmsg("Unterminated string starting at", s, begin))
     85         end = chunk.end()
     86         content, terminator = chunk.groups()
     87         # Content is contains zero or more unescaped string characters
     88         if content:
     89             if not isinstance(content, unicode):
     90                 content = unicode(content, encoding)
     91             _append(content)
     92         # Terminator is the end of string, a literal control character,
     93         # or a backslash denoting that an escape sequence follows
     94         if terminator == '"':
     95             break
     96         elif terminator != '\\':
     97             if strict:
     98                 #msg = "Invalid control character %r at" % (terminator,)
     99                 msg = "Invalid control character {0!r} at".format(terminator)
    100                 raise ValueError(errmsg(msg, s, end))
    101             else:
    102                 _append(terminator)
    103                 continue
    104         try:
    105             esc = s[end]
    106         except IndexError:
    107             raise ValueError(
    108                 errmsg("Unterminated string starting at", s, begin))
    109         # If not a unicode escape sequence, must be in the lookup table
    110         if esc != 'u':
    111             try:
    112                 char = _b[esc]
    113             except KeyError:
    114                 msg = "Invalid \\escape: " + repr(esc)
    115                 raise ValueError(errmsg(msg, s, end))
    116             end += 1
    117         else:
    118             # Unicode escape sequence
    119             esc = s[end + 1:end + 5]
    120             next_end = end + 5
    121             if len(esc) != 4:
    122                 msg = "Invalid \\uXXXX escape"
    123                 raise ValueError(errmsg(msg, s, end))
    124             uni = int(esc, 16)
    125             # Check for surrogate pair on UCS-4 systems
    126             if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
    127                 msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
    128                 if not s[end + 5:end + 7] == '\\u':
    129                     raise ValueError(errmsg(msg, s, end))
    130                 esc2 = s[end + 7:end + 11]
    131                 if len(esc2) != 4:
    132                     raise ValueError(errmsg(msg, s, end))
    133                 uni2 = int(esc2, 16)
    134                 uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
    135                 next_end += 6
    136             char = unichr(uni)
    137             end = next_end
    138         # Append the unescaped character
    139         _append(char)
    140     return u''.join(chunks), end
    141 
    142 
# Use speedup if available: c_scanstring is None when the import from
# _json failed, in which case the pure-Python implementation is used.
scanstring = c_scanstring or py_scanstring

# Matches a (possibly empty) run of space, tab, newline and carriage-return
# characters; used to skip over whitespace from a given position.
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
# The same four characters as a plain string, for cheap ``in`` tests
# before falling back to the regex.
WHITESPACE_STR = ' \t\n\r'
    148 
def JSONObject(s_and_end, encoding, strict, scan_once, object_hook,
               object_pairs_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the members of a JSON object and return ``(result, end)``.

    ``s_and_end`` is a ``(s, end)`` pair; ``end`` is presumably the index
    just past the opening ``{`` (the caller is not visible here -- confirm
    against the scanner).  ``encoding`` and ``strict`` are passed through
    to ``scanstring`` when reading keys.  ``scan_once(s, idx)`` decodes
    each value and is expected to return ``(value, next_idx)`` or raise
    StopIteration.

    The collected ``(key, value)`` pairs are handed to
    ``object_pairs_hook`` if it is not None; otherwise they are turned
    into a dict, which is passed through ``object_hook`` if that is not
    None.  The returned ``end`` is the index just past the closing ``}``.

    Raises ValueError when a property name, the ``:`` delimiter, a value
    or the ``,`` delimiter is missing.
    """
    s, end = s_and_end
    pairs = []
    pairs_append = pairs.append
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            if object_pairs_hook is not None:
                result = object_pairs_hook(pairs)
                return result, end + 1
            pairs = {}
            if object_hook is not None:
                pairs = object_hook(pairs)
            return pairs, end + 1
        elif nextchar != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes", s, end))
    # Step over the opening quote of the first key
    end += 1
    while True:
        key, end = scanstring(s, end, encoding, strict)

        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting ':' delimiter", s, end))
        end += 1

        # Skip whitespace after the colon; running off the end of the
        # input is ignored here and reported by scan_once below.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        pairs_append((key, value))

        # Find the character after the value; '' stands for end of input
        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            nextchar = ''
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            # end was already advanced, so end - 1 is the offending position
            raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))

        # After a comma the next non-whitespace character must open a key
        try:
            nextchar = s[end]
            if nextchar in _ws:
                end += 1
                nextchar = s[end]
                if nextchar in _ws:
                    end = _w(s, end + 1).end()
                    nextchar = s[end]
        except IndexError:
            nextchar = ''

        end += 1
        if nextchar != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes", s, end - 1))
    # object_pairs_hook takes priority over object_hook
    if object_pairs_hook is not None:
        result = object_pairs_hook(pairs)
        return result, end
    pairs = dict(pairs)
    if object_hook is not None:
        pairs = object_hook(pairs)
    return pairs, end
    236 
    237 def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    238     s, end = s_and_end
    239     values = []
    240     nextchar = s[end:end + 1]
    241     if nextchar in _ws:
    242         end = _w(s, end + 1).end()
    243         nextchar = s[end:end + 1]
    244     # Look-ahead for trivial empty array
    245     if nextchar == ']':
    246         return values, end + 1
    247     _append = values.append
    248     while True:
    249         try:
    250             value, end = scan_once(s, end)
    251         except StopIteration:
    252             raise ValueError(errmsg("Expecting object", s, end))
    253         _append(value)
    254         nextchar = s[end:end + 1]
    255         if nextchar in _ws:
    256             end = _w(s, end + 1).end()
    257             nextchar = s[end:end + 1]
    258         end += 1
    259         if nextchar == ']':
    260             break
    261         elif nextchar != ',':
    262             raise ValueError(errmsg("Expecting ',' delimiter", s, end))
    263         try:
    264             if s[end] in _ws:
    265                 end += 1
    266                 if s[end] in _ws:
    267                     end = _w(s, end + 1).end()
    268         except IndexError:
    269             pass
    270 
    271     return values, end
    272 
    273 class JSONDecoder(object):
    274     """Simple JSON <http://json.org> decoder
    275 
    276     Performs the following translations in decoding by default:
    277 
    278     +---------------+-------------------+
    279     | JSON          | Python            |
    280     +===============+===================+
    281     | object        | dict              |
    282     +---------------+-------------------+
    283     | array         | list              |
    284     +---------------+-------------------+
    285     | string        | unicode           |
    286     +---------------+-------------------+
    287     | number (int)  | int, long         |
    288     +---------------+-------------------+
    289     | number (real) | float             |
    290     +---------------+-------------------+
    291     | true          | True              |
    292     +---------------+-------------------+
    293     | false         | False             |
    294     +---------------+-------------------+
    295     | null          | None              |
    296     +---------------+-------------------+
    297 
    298     It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as
    299     their corresponding ``float`` values, which is outside the JSON spec.
    300 
    301     """
    302 
    303     def __init__(self, encoding=None, object_hook=None, parse_float=None,
    304             parse_int=None, parse_constant=None, strict=True,
    305             object_pairs_hook=None):
    306         """``encoding`` determines the encoding used to interpret any ``str``
    307         objects decoded by this instance (utf-8 by default).  It has no
    308         effect when decoding ``unicode`` objects.
    309 
    310         Note that currently only encodings that are a superset of ASCII work,
    311         strings of other encodings should be passed in as ``unicode``.
    312 
    313         ``object_hook``, if specified, will be called with the result
    314         of every JSON object decoded and its return value will be used in
    315         place of the given ``dict``.  This can be used to provide custom
    316         deserializations (e.g. to support JSON-RPC class hinting).
    317 
    318         ``object_pairs_hook``, if specified will be called with the result of
    319         every JSON object decoded with an ordered list of pairs.  The return
    320         value of ``object_pairs_hook`` will be used instead of the ``dict``.
    321         This feature can be used to implement custom decoders that rely on the
    322         order that the key and value pairs are decoded (for example,
    323         collections.OrderedDict will remember the order of insertion). If
    324         ``object_hook`` is also defined, the ``object_pairs_hook`` takes
    325         priority.
    326 
    327         ``parse_float``, if specified, will be called with the string
    328         of every JSON float to be decoded. By default this is equivalent to
    329         float(num_str). This can be used to use another datatype or parser
    330         for JSON floats (e.g. decimal.Decimal).
    331 
    332         ``parse_int``, if specified, will be called with the string
    333         of every JSON int to be decoded. By default this is equivalent to
    334         int(num_str). This can be used to use another datatype or parser
    335         for JSON integers (e.g. float).
    336 
    337         ``parse_constant``, if specified, will be called with one of the
    338         following strings: -Infinity, Infinity, NaN.
    339         This can be used to raise an exception if invalid JSON numbers
    340         are encountered.
    341 
    342         If ``strict`` is false (true is the default), then control
    343         characters will be allowed inside strings.  Control characters in
    344         this context are those with character codes in the 0-31 range,
    345         including ``'\\t'`` (tab), ``'\\n'``, ``'\\r'`` and ``'\\0'``.
    346 
    347         """
    348         self.encoding = encoding
    349         self.object_hook = object_hook
    350         self.object_pairs_hook = object_pairs_hook
    351         self.parse_float = parse_float or float
    352         self.parse_int = parse_int or int
    353         self.parse_constant = parse_constant or _CONSTANTS.__getitem__
    354         self.strict = strict
    355         self.parse_object = JSONObject
    356         self.parse_array = JSONArray
    357         self.parse_string = scanstring
    358         self.scan_once = scanner.make_scanner(self)
    359 
    360     def decode(self, s, _w=WHITESPACE.match):
    361         """Return the Python representation of ``s`` (a ``str`` or ``unicode``
    362         instance containing a JSON document)
    363 
    364         """
    365         obj, end = self.raw_decode(s, idx=_w(s, 0).end())
    366         end = _w(s, end).end()
    367         if end != len(s):
    368             raise ValueError(errmsg("Extra data", s, end, len(s)))
    369         return obj
    370 
    371     def raw_decode(self, s, idx=0):
    372         """Decode a JSON document from ``s`` (a ``str`` or ``unicode``
    373         beginning with a JSON document) and return a 2-tuple of the Python
    374         representation and the index in ``s`` where the document ended.
    375 
    376         This can be used to decode a JSON document from a string that may
    377         have extraneous data at the end.
    378 
    379         """
    380         try:
    381             obj, end = self.scan_once(s, idx)
    382         except StopIteration:
    383             raise ValueError("No JSON object could be decoded")
    384         return obj, end
    385