Home | History | Annotate | Download | only in simplejson
      1 """Implementation of JSONDecoder
      2 """
      3 import re
      4 import sys
      5 import struct
      6 
      7 from simplejson.scanner import make_scanner
      8 def _import_c_scanstring():
      9     try:
     10         from simplejson._speedups import scanstring
     11         return scanstring
     12     except ImportError:
     13         return None
     14 c_scanstring = _import_c_scanstring()
     15 
     16 __all__ = ['JSONDecoder']
     17 
     18 FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
     19 
     20 def _floatconstants():
     21     _BYTES = '7FF80000000000007FF0000000000000'.decode('hex')
     22     # The struct module in Python 2.4 would get frexp() out of range here
     23     # when an endian is specified in the format string. Fixed in Python 2.5+
     24     if sys.byteorder != 'big':
     25         _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
     26     nan, inf = struct.unpack('dd', _BYTES)
     27     return nan, inf, -inf
     28 
     29 NaN, PosInf, NegInf = _floatconstants()
     30 
     31 
     32 class JSONDecodeError(ValueError):
     33     """Subclass of ValueError with the following additional properties:
     34 
     35     msg: The unformatted error message
     36     doc: The JSON document being parsed
     37     pos: The start index of doc where parsing failed
     38     end: The end index of doc where parsing failed (may be None)
     39     lineno: The line corresponding to pos
     40     colno: The column corresponding to pos
     41     endlineno: The line corresponding to end (may be None)
     42     endcolno: The column corresponding to end (may be None)
     43 
     44     """
     45     def __init__(self, msg, doc, pos, end=None):
     46         ValueError.__init__(self, errmsg(msg, doc, pos, end=end))
     47         self.msg = msg
     48         self.doc = doc
     49         self.pos = pos
     50         self.end = end
     51         self.lineno, self.colno = linecol(doc, pos)
     52         if end is not None:
     53             self.endlineno, self.endcolno = linecol(doc, end)
     54         else:
     55             self.endlineno, self.endcolno = None, None
     56 
     57 
     58 def linecol(doc, pos):
     59     lineno = doc.count('\n', 0, pos) + 1
     60     if lineno == 1:
     61         colno = pos
     62     else:
     63         colno = pos - doc.rindex('\n', 0, pos)
     64     return lineno, colno
     65 
     66 
     67 def errmsg(msg, doc, pos, end=None):
     68     # Note that this function is called from _speedups
     69     lineno, colno = linecol(doc, pos)
     70     if end is None:
     71         #fmt = '{0}: line {1} column {2} (char {3})'
     72         #return fmt.format(msg, lineno, colno, pos)
     73         fmt = '%s: line %d column %d (char %d)'
     74         return fmt % (msg, lineno, colno, pos)
     75     endlineno, endcolno = linecol(doc, end)
     76     #fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'
     77     #return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end)
     78     fmt = '%s: line %d column %d - line %d column %d (char %d - %d)'
     79     return fmt % (msg, lineno, colno, endlineno, endcolno, pos, end)
     80 
     81 
# Spellings of the non-finite numbers (outside the JSON spec) mapped to the
# float values produced by _floatconstants().
_CONSTANTS = {
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
}

# Group 1 lazily matches a (possibly empty) run of characters; group 2 is the
# character that ends the run: a double quote, a backslash, or a control
# character in the range 0x00-0x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
# Character following a backslash -> the unicode character it stands for.
# (The \uXXXX form is not listed here; py_scanstring handles it separately.)
BACKSLASH = {
    '"': u'"', '\\': u'\\', '/': u'/',
    'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
}

# Encoding py_scanstring falls back to when it is called with encoding=None.
DEFAULT_ENCODING = "utf-8"
     95 
     96 def py_scanstring(s, end, encoding=None, strict=True,
     97         _b=BACKSLASH, _m=STRINGCHUNK.match):
     98     """Scan the string s for a JSON string. End is the index of the
     99     character in s after the quote that started the JSON string.
    100     Unescapes all valid JSON string escape sequences and raises ValueError
    101     on attempt to decode an invalid string. If strict is False then literal
    102     control characters are allowed in the string.
    103 
    104     Returns a tuple of the decoded string and the index of the character in s
    105     after the end quote."""
    106     if encoding is None:
    107         encoding = DEFAULT_ENCODING
    108     chunks = []
    109     _append = chunks.append
    110     begin = end - 1
    111     while 1:
    112         chunk = _m(s, end)
    113         if chunk is None:
    114             raise JSONDecodeError(
    115                 "Unterminated string starting at", s, begin)
    116         end = chunk.end()
    117         content, terminator = chunk.groups()
    118         # Content is contains zero or more unescaped string characters
    119         if content:
    120             if not isinstance(content, unicode):
    121                 content = unicode(content, encoding)
    122             _append(content)
    123         # Terminator is the end of string, a literal control character,
    124         # or a backslash denoting that an escape sequence follows
    125         if terminator == '"':
    126             break
    127         elif terminator != '\\':
    128             if strict:
    129                 msg = "Invalid control character %r at" % (terminator,)
    130                 #msg = "Invalid control character {0!r} at".format(terminator)
    131                 raise JSONDecodeError(msg, s, end)
    132             else:
    133                 _append(terminator)
    134                 continue
    135         try:
    136             esc = s[end]
    137         except IndexError:
    138             raise JSONDecodeError(
    139                 "Unterminated string starting at", s, begin)
    140         # If not a unicode escape sequence, must be in the lookup table
    141         if esc != 'u':
    142             try:
    143                 char = _b[esc]
    144             except KeyError:
    145                 msg = "Invalid \\escape: " + repr(esc)
    146                 raise JSONDecodeError(msg, s, end)
    147             end += 1
    148         else:
    149             # Unicode escape sequence
    150             esc = s[end + 1:end + 5]
    151             next_end = end + 5
    152             if len(esc) != 4:
    153                 msg = "Invalid \\uXXXX escape"
    154                 raise JSONDecodeError(msg, s, end)
    155             uni = int(esc, 16)
    156             # Check for surrogate pair on UCS-4 systems
    157             if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
    158                 msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
    159                 if not s[end + 5:end + 7] == '\\u':
    160                     raise JSONDecodeError(msg, s, end)
    161                 esc2 = s[end + 7:end + 11]
    162                 if len(esc2) != 4:
    163                     raise JSONDecodeError(msg, s, end)
    164                 uni2 = int(esc2, 16)
    165                 uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
    166                 next_end += 6
    167             char = unichr(uni)
    168             end = next_end
    169         # Append the unescaped character
    170         _append(char)
    171     return u''.join(chunks), end
    172 
    173 
    174 # Use speedup if available
    175 scanstring = c_scanstring or py_scanstring
    176 
    177 WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
    178 WHITESPACE_STR = ' \t\n\r'
    179 
    180 def JSONObject((s, end), encoding, strict, scan_once, object_hook,
    181         object_pairs_hook, memo=None,
    182         _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    183     # Backwards compatibility
    184     if memo is None:
    185         memo = {}
    186     memo_get = memo.setdefault
    187     pairs = []
    188     # Use a slice to prevent IndexError from being raised, the following
    189     # check will raise a more specific ValueError if the string is empty
    190     nextchar = s[end:end + 1]
    191     # Normally we expect nextchar == '"'
    192     if nextchar != '"':
    193         if nextchar in _ws:
    194             end = _w(s, end).end()
    195             nextchar = s[end:end + 1]
    196         # Trivial empty object
    197         if nextchar == '}':
    198             if object_pairs_hook is not None:
    199                 result = object_pairs_hook(pairs)
    200                 return result, end + 1
    201             pairs = {}
    202             if object_hook is not None:
    203                 pairs = object_hook(pairs)
    204             return pairs, end + 1
    205         elif nextchar != '"':
    206             raise JSONDecodeError(
    207                 "Expecting property name enclosed in double quotes",
    208                 s, end)
    209     end += 1
    210     while True:
    211         key, end = scanstring(s, end, encoding, strict)
    212         key = memo_get(key, key)
    213 
    214         # To skip some function call overhead we optimize the fast paths where
    215         # the JSON key separator is ": " or just ":".
    216         if s[end:end + 1] != ':':
    217             end = _w(s, end).end()
    218             if s[end:end + 1] != ':':
    219                 raise JSONDecodeError("Expecting ':' delimiter", s, end)
    220 
    221         end += 1
    222 
    223         try:
    224             if s[end] in _ws:
    225                 end += 1
    226                 if s[end] in _ws:
    227                     end = _w(s, end + 1).end()
    228         except IndexError:
    229             pass
    230 
    231         try:
    232             value, end = scan_once(s, end)
    233         except StopIteration:
    234             raise JSONDecodeError("Expecting object", s, end)
    235         pairs.append((key, value))
    236 
    237         try:
    238             nextchar = s[end]
    239             if nextchar in _ws:
    240                 end = _w(s, end + 1).end()
    241                 nextchar = s[end]
    242         except IndexError:
    243             nextchar = ''
    244         end += 1
    245 
    246         if nextchar == '}':
    247             break
    248         elif nextchar != ',':
    249             raise JSONDecodeError("Expecting ',' delimiter", s, end - 1)
    250 
    251         try:
    252             nextchar = s[end]
    253             if nextchar in _ws:
    254                 end += 1
    255                 nextchar = s[end]
    256                 if nextchar in _ws:
    257                     end = _w(s, end + 1).end()
    258                     nextchar = s[end]
    259         except IndexError:
    260             nextchar = ''
    261 
    262         end += 1
    263         if nextchar != '"':
    264             raise JSONDecodeError(
    265                 "Expecting property name enclosed in double quotes",
    266                 s, end - 1)
    267 
    268     if object_pairs_hook is not None:
    269         result = object_pairs_hook(pairs)
    270         return result, end
    271     pairs = dict(pairs)
    272     if object_hook is not None:
    273         pairs = object_hook(pairs)
    274     return pairs, end
    275 
    276 def JSONArray((s, end), scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    277     values = []
    278     nextchar = s[end:end + 1]
    279     if nextchar in _ws:
    280         end = _w(s, end + 1).end()
    281         nextchar = s[end:end + 1]
    282     # Look-ahead for trivial empty array
    283     if nextchar == ']':
    284         return values, end + 1
    285     _append = values.append
    286     while True:
    287         try:
    288             value, end = scan_once(s, end)
    289         except StopIteration:
    290             raise JSONDecodeError("Expecting object", s, end)
    291         _append(value)
    292         nextchar = s[end:end + 1]
    293         if nextchar in _ws:
    294             end = _w(s, end + 1).end()
    295             nextchar = s[end:end + 1]
    296         end += 1
    297         if nextchar == ']':
    298             break
    299         elif nextchar != ',':
    300             raise JSONDecodeError("Expecting ',' delimiter", s, end)
    301 
    302         try:
    303             if s[end] in _ws:
    304                 end += 1
    305                 if s[end] in _ws:
    306                     end = _w(s, end + 1).end()
    307         except IndexError:
    308             pass
    309 
    310     return values, end
    311 
    312 class JSONDecoder(object):
    313     """Simple JSON <http://json.org> decoder
    314 
    315     Performs the following translations in decoding by default:
    316 
    317     +---------------+-------------------+
    318     | JSON          | Python            |
    319     +===============+===================+
    320     | object        | dict              |
    321     +---------------+-------------------+
    322     | array         | list              |
    323     +---------------+-------------------+
    324     | string        | unicode           |
    325     +---------------+-------------------+
    326     | number (int)  | int, long         |
    327     +---------------+-------------------+
    328     | number (real) | float             |
    329     +---------------+-------------------+
    330     | true          | True              |
    331     +---------------+-------------------+
    332     | false         | False             |
    333     +---------------+-------------------+
    334     | null          | None              |
    335     +---------------+-------------------+
    336 
    337     It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as
    338     their corresponding ``float`` values, which is outside the JSON spec.
    339 
    340     """
    341 
    342     def __init__(self, encoding=None, object_hook=None, parse_float=None,
    343             parse_int=None, parse_constant=None, strict=True,
    344             object_pairs_hook=None):
    345         """
    346         *encoding* determines the encoding used to interpret any
    347         :class:`str` objects decoded by this instance (``'utf-8'`` by
    348         default).  It has no effect when decoding :class:`unicode` objects.
    349 
    350         Note that currently only encodings that are a superset of ASCII work,
    351         strings of other encodings should be passed in as :class:`unicode`.
    352 
    353         *object_hook*, if specified, will be called with the result of every
    354         JSON object decoded and its return value will be used in place of the
    355         given :class:`dict`.  This can be used to provide custom
    356         deserializations (e.g. to support JSON-RPC class hinting).
    357 
    358         *object_pairs_hook* is an optional function that will be called with
    359         the result of any object literal decode with an ordered list of pairs.
    360         The return value of *object_pairs_hook* will be used instead of the
    361         :class:`dict`.  This feature can be used to implement custom decoders
    362         that rely on the order that the key and value pairs are decoded (for
    363         example, :func:`collections.OrderedDict` will remember the order of
    364         insertion). If *object_hook* is also defined, the *object_pairs_hook*
    365         takes priority.
    366 
    367         *parse_float*, if specified, will be called with the string of every
    368         JSON float to be decoded.  By default, this is equivalent to
    369         ``float(num_str)``. This can be used to use another datatype or parser
    370         for JSON floats (e.g. :class:`decimal.Decimal`).
    371 
    372         *parse_int*, if specified, will be called with the string of every
    373         JSON int to be decoded.  By default, this is equivalent to
    374         ``int(num_str)``.  This can be used to use another datatype or parser
    375         for JSON integers (e.g. :class:`float`).
    376 
    377         *parse_constant*, if specified, will be called with one of the
    378         following strings: ``'-Infinity'``, ``'Infinity'``, ``'NaN'``.  This
    379         can be used to raise an exception if invalid JSON numbers are
    380         encountered.
    381 
    382         *strict* controls the parser's behavior when it encounters an
    383         invalid control character in a string. The default setting of
    384         ``True`` means that unescaped control characters are parse errors, if
    385         ``False`` then control characters will be allowed in strings.
    386 
    387         """
    388         self.encoding = encoding
    389         self.object_hook = object_hook
    390         self.object_pairs_hook = object_pairs_hook
    391         self.parse_float = parse_float or float
    392         self.parse_int = parse_int or int
    393         self.parse_constant = parse_constant or _CONSTANTS.__getitem__
    394         self.strict = strict
    395         self.parse_object = JSONObject
    396         self.parse_array = JSONArray
    397         self.parse_string = scanstring
    398         self.memo = {}
    399         self.scan_once = make_scanner(self)
    400 
    401     def decode(self, s, _w=WHITESPACE.match):
    402         """Return the Python representation of ``s`` (a ``str`` or ``unicode``
    403         instance containing a JSON document)
    404 
    405         """
    406         obj, end = self.raw_decode(s)
    407         end = _w(s, end).end()
    408         if end != len(s):
    409             raise JSONDecodeError("Extra data", s, end, len(s))
    410         return obj
    411 
    412     def raw_decode(self, s, idx=0, _w=WHITESPACE.match):
    413         """Decode a JSON document from ``s`` (a ``str`` or ``unicode``
    414         beginning with a JSON document) and return a 2-tuple of the Python
    415         representation and the index in ``s`` where the document ended.
    416         Optionally, ``idx`` can be used to specify an offset in ``s`` where
    417         the JSON document begins.
    418 
    419         This can be used to decode a JSON document from a string that may
    420         have extraneous data at the end.
    421 
    422         """
    423         try:
    424             obj, end = self.scan_once(s, idx=_w(s, idx).end())
    425         except StopIteration:
    426             raise JSONDecodeError("No JSON object could be decoded", s, idx)
    427         return obj, end
    428