1 """Implementation of JSONDecoder 2 """ 3 import re 4 import sys 5 import struct 6 7 from simplejson.scanner import make_scanner 8 def _import_c_scanstring(): 9 try: 10 from simplejson._speedups import scanstring 11 return scanstring 12 except ImportError: 13 return None 14 c_scanstring = _import_c_scanstring() 15 16 __all__ = ['JSONDecoder'] 17 18 FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL 19 20 def _floatconstants(): 21 _BYTES = '7FF80000000000007FF0000000000000'.decode('hex') 22 # The struct module in Python 2.4 would get frexp() out of range here 23 # when an endian is specified in the format string. Fixed in Python 2.5+ 24 if sys.byteorder != 'big': 25 _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1] 26 nan, inf = struct.unpack('dd', _BYTES) 27 return nan, inf, -inf 28 29 NaN, PosInf, NegInf = _floatconstants() 30 31 32 class JSONDecodeError(ValueError): 33 """Subclass of ValueError with the following additional properties: 34 35 msg: The unformatted error message 36 doc: The JSON document being parsed 37 pos: The start index of doc where parsing failed 38 end: The end index of doc where parsing failed (may be None) 39 lineno: The line corresponding to pos 40 colno: The column corresponding to pos 41 endlineno: The line corresponding to end (may be None) 42 endcolno: The column corresponding to end (may be None) 43 44 """ 45 def __init__(self, msg, doc, pos, end=None): 46 ValueError.__init__(self, errmsg(msg, doc, pos, end=end)) 47 self.msg = msg 48 self.doc = doc 49 self.pos = pos 50 self.end = end 51 self.lineno, self.colno = linecol(doc, pos) 52 if end is not None: 53 self.endlineno, self.endcolno = linecol(doc, end) 54 else: 55 self.endlineno, self.endcolno = None, None 56 57 58 def linecol(doc, pos): 59 lineno = doc.count('\n', 0, pos) + 1 60 if lineno == 1: 61 colno = pos 62 else: 63 colno = pos - doc.rindex('\n', 0, pos) 64 return lineno, colno 65 66 67 def errmsg(msg, doc, pos, end=None): 68 # Note that this function is called from _speedups 69 lineno, colno = linecol(doc, pos) 70 if end is None: 71 #fmt = '{0}: line {1} column {2} (char {3})' 72 #return fmt.format(msg, lineno, colno, pos) 73 fmt = '%s: line %d column %d (char %d)' 74 return fmt % (msg, lineno, colno, pos) 75 endlineno, endcolno = linecol(doc, end) 76 #fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})' 77 #return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end) 78 fmt = '%s: line %d column %d - line %d column %d (char %d - %d)' 79 return fmt % (msg, lineno, colno, endlineno, endcolno, pos, end) 80 81 82 _CONSTANTS = { 83 '-Infinity': NegInf, 84 'Infinity': PosInf, 85 'NaN': NaN, 86 } 87 88 STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS) 89 BACKSLASH = { 90 '"': u'"', '\\': u'\\', '/': u'/', 91 'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t', 92 } 93 94 DEFAULT_ENCODING = "utf-8" 95 96 def py_scanstring(s, end, encoding=None, strict=True, 97 _b=BACKSLASH, _m=STRINGCHUNK.match): 98 """Scan the string s for a JSON string. End is the index of the 99 character in s after the quote that started the JSON string. 100 Unescapes all valid JSON string escape sequences and raises ValueError 101 on attempt to decode an invalid string. If strict is False then literal 102 control characters are allowed in the string. 103 104 Returns a tuple of the decoded string and the index of the character in s 105 after the end quote.""" 106 if encoding is None: 107 encoding = DEFAULT_ENCODING 108 chunks = [] 109 _append = chunks.append 110 begin = end - 1 111 while 1: 112 chunk = _m(s, end) 113 if chunk is None: 114 raise JSONDecodeError( 115 "Unterminated string starting at", s, begin) 116 end = chunk.end() 117 content, terminator = chunk.groups() 118 # Content is contains zero or more unescaped string characters 119 if content: 120 if not isinstance(content, unicode): 121 content = unicode(content, encoding) 122 _append(content) 123 # Terminator is the end of string, a literal control character, 124 # or a backslash denoting that an escape sequence follows 125 if terminator == '"': 126 break 127 elif terminator != '\\': 128 if strict: 129 msg = "Invalid control character %r at" % (terminator,) 130 #msg = "Invalid control character {0!r} at".format(terminator) 131 raise JSONDecodeError(msg, s, end) 132 else: 133 _append(terminator) 134 continue 135 try: 136 esc = s[end] 137 except IndexError: 138 raise JSONDecodeError( 139 "Unterminated string starting at", s, begin) 140 # If not a unicode escape sequence, must be in the lookup table 141 if esc != 'u': 142 try: 143 char = _b[esc] 144 except KeyError: 145 msg = "Invalid \\escape: " + repr(esc) 146 raise JSONDecodeError(msg, s, end) 147 end += 1 148 else: 149 # Unicode escape sequence 150 esc = s[end + 1:end + 5] 151 next_end = end + 5 152 if len(esc) != 4: 153 msg = "Invalid \\uXXXX escape" 154 raise JSONDecodeError(msg, s, end) 155 uni = int(esc, 16) 156 # Check for surrogate pair on UCS-4 systems 157 if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535: 158 msg = "Invalid \\uXXXX\\uXXXX surrogate pair" 159 if not s[end + 5:end + 7] == '\\u': 160 raise JSONDecodeError(msg, s, end) 161 esc2 = s[end + 7:end + 11] 162 if len(esc2) != 4: 163 raise JSONDecodeError(msg, s, end) 164 uni2 = int(esc2, 16) 165 uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) 166 next_end += 6 167 char = unichr(uni) 168 end = next_end 169 # Append the unescaped character 170 _append(char) 171 return u''.join(chunks), end 172 173 174 # Use speedup if available 175 scanstring = c_scanstring or py_scanstring 176 177 WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS) 178 WHITESPACE_STR = ' \t\n\r' 179 180 def JSONObject((s, end), encoding, strict, scan_once, object_hook, 181 object_pairs_hook, memo=None, 182 _w=WHITESPACE.match, _ws=WHITESPACE_STR): 183 # Backwards compatibility 184 if memo is None: 185 memo = {} 186 memo_get = memo.setdefault 187 pairs = [] 188 # Use a slice to prevent IndexError from being raised, the following 189 # check will raise a more specific ValueError if the string is empty 190 nextchar = s[end:end + 1] 191 # Normally we expect nextchar == '"' 192 if nextchar != '"': 193 if nextchar in _ws: 194 end = _w(s, end).end() 195 nextchar = s[end:end + 1] 196 # Trivial empty object 197 if nextchar == '}': 198 if object_pairs_hook is not None: 199 result = object_pairs_hook(pairs) 200 return result, end + 1 201 pairs = {} 202 if object_hook is not None: 203 pairs = object_hook(pairs) 204 return pairs, end + 1 205 elif nextchar != '"': 206 raise JSONDecodeError( 207 "Expecting property name enclosed in double quotes", 208 s, end) 209 end += 1 210 while True: 211 key, end = scanstring(s, end, encoding, strict) 212 key = memo_get(key, key) 213 214 # To skip some function call overhead we optimize the fast paths where 215 # the JSON key separator is ": " or just ":". 216 if s[end:end + 1] != ':': 217 end = _w(s, end).end() 218 if s[end:end + 1] != ':': 219 raise JSONDecodeError("Expecting ':' delimiter", s, end) 220 221 end += 1 222 223 try: 224 if s[end] in _ws: 225 end += 1 226 if s[end] in _ws: 227 end = _w(s, end + 1).end() 228 except IndexError: 229 pass 230 231 try: 232 value, end = scan_once(s, end) 233 except StopIteration: 234 raise JSONDecodeError("Expecting object", s, end) 235 pairs.append((key, value)) 236 237 try: 238 nextchar = s[end] 239 if nextchar in _ws: 240 end = _w(s, end + 1).end() 241 nextchar = s[end] 242 except IndexError: 243 nextchar = '' 244 end += 1 245 246 if nextchar == '}': 247 break 248 elif nextchar != ',': 249 raise JSONDecodeError("Expecting ',' delimiter", s, end - 1) 250 251 try: 252 nextchar = s[end] 253 if nextchar in _ws: 254 end += 1 255 nextchar = s[end] 256 if nextchar in _ws: 257 end = _w(s, end + 1).end() 258 nextchar = s[end] 259 except IndexError: 260 nextchar = '' 261 262 end += 1 263 if nextchar != '"': 264 raise JSONDecodeError( 265 "Expecting property name enclosed in double quotes", 266 s, end - 1) 267 268 if object_pairs_hook is not None: 269 result = object_pairs_hook(pairs) 270 return result, end 271 pairs = dict(pairs) 272 if object_hook is not None: 273 pairs = object_hook(pairs) 274 return pairs, end 275 276 def JSONArray((s, end), scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR): 277 values = [] 278 nextchar = s[end:end + 1] 279 if nextchar in _ws: 280 end = _w(s, end + 1).end() 281 nextchar = s[end:end + 1] 282 # Look-ahead for trivial empty array 283 if nextchar == ']': 284 return values, end + 1 285 _append = values.append 286 while True: 287 try: 288 value, end = scan_once(s, end) 289 except StopIteration: 290 raise JSONDecodeError("Expecting object", s, end) 291 _append(value) 292 nextchar = s[end:end + 1] 293 if nextchar in _ws: 294 end = _w(s, end + 1).end() 295 nextchar = s[end:end + 1] 296 end += 1 297 if nextchar == ']': 298 break 299 elif nextchar != ',': 300 raise JSONDecodeError("Expecting ',' delimiter", s, end) 301 302 try: 303 if s[end] in _ws: 304 end += 1 305 if s[end] in _ws: 306 end = _w(s, end + 1).end() 307 except IndexError: 308 pass 309 310 return values, end 311 312 class JSONDecoder(object): 313 """Simple JSON <http://json.org> decoder 314 315 Performs the following translations in decoding by default: 316 317 +---------------+-------------------+ 318 | JSON | Python | 319 +===============+===================+ 320 | object | dict | 321 +---------------+-------------------+ 322 | array | list | 323 +---------------+-------------------+ 324 | string | unicode | 325 +---------------+-------------------+ 326 | number (int) | int, long | 327 +---------------+-------------------+ 328 | number (real) | float | 329 +---------------+-------------------+ 330 | true | True | 331 +---------------+-------------------+ 332 | false | False | 333 +---------------+-------------------+ 334 | null | None | 335 +---------------+-------------------+ 336 337 It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as 338 their corresponding ``float`` values, which is outside the JSON spec. 339 340 """ 341 342 def __init__(self, encoding=None, object_hook=None, parse_float=None, 343 parse_int=None, parse_constant=None, strict=True, 344 object_pairs_hook=None): 345 """ 346 *encoding* determines the encoding used to interpret any 347 :class:`str` objects decoded by this instance (``'utf-8'`` by 348 default). It has no effect when decoding :class:`unicode` objects. 349 350 Note that currently only encodings that are a superset of ASCII work, 351 strings of other encodings should be passed in as :class:`unicode`. 352 353 *object_hook*, if specified, will be called with the result of every 354 JSON object decoded and its return value will be used in place of the 355 given :class:`dict`. This can be used to provide custom 356 deserializations (e.g. to support JSON-RPC class hinting). 357 358 *object_pairs_hook* is an optional function that will be called with 359 the result of any object literal decode with an ordered list of pairs. 360 The return value of *object_pairs_hook* will be used instead of the 361 :class:`dict`. This feature can be used to implement custom decoders 362 that rely on the order that the key and value pairs are decoded (for 363 example, :func:`collections.OrderedDict` will remember the order of 364 insertion). If *object_hook* is also defined, the *object_pairs_hook* 365 takes priority. 366 367 *parse_float*, if specified, will be called with the string of every 368 JSON float to be decoded. By default, this is equivalent to 369 ``float(num_str)``. This can be used to use another datatype or parser 370 for JSON floats (e.g. :class:`decimal.Decimal`). 371 372 *parse_int*, if specified, will be called with the string of every 373 JSON int to be decoded. By default, this is equivalent to 374 ``int(num_str)``. This can be used to use another datatype or parser 375 for JSON integers (e.g. :class:`float`). 376 377 *parse_constant*, if specified, will be called with one of the 378 following strings: ``'-Infinity'``, ``'Infinity'``, ``'NaN'``. This 379 can be used to raise an exception if invalid JSON numbers are 380 encountered. 381 382 *strict* controls the parser's behavior when it encounters an 383 invalid control character in a string. The default setting of 384 ``True`` means that unescaped control characters are parse errors, if 385 ``False`` then control characters will be allowed in strings. 386 387 """ 388 self.encoding = encoding 389 self.object_hook = object_hook 390 self.object_pairs_hook = object_pairs_hook 391 self.parse_float = parse_float or float 392 self.parse_int = parse_int or int 393 self.parse_constant = parse_constant or _CONSTANTS.__getitem__ 394 self.strict = strict 395 self.parse_object = JSONObject 396 self.parse_array = JSONArray 397 self.parse_string = scanstring 398 self.memo = {} 399 self.scan_once = make_scanner(self) 400 401 def decode(self, s, _w=WHITESPACE.match): 402 """Return the Python representation of ``s`` (a ``str`` or ``unicode`` 403 instance containing a JSON document) 404 405 """ 406 obj, end = self.raw_decode(s) 407 end = _w(s, end).end() 408 if end != len(s): 409 raise JSONDecodeError("Extra data", s, end, len(s)) 410 return obj 411 412 def raw_decode(self, s, idx=0, _w=WHITESPACE.match): 413 """Decode a JSON document from ``s`` (a ``str`` or ``unicode`` 414 beginning with a JSON document) and return a 2-tuple of the Python 415 representation and the index in ``s`` where the document ended. 416 Optionally, ``idx`` can be used to specify an offset in ``s`` where 417 the JSON document begins. 418 419 This can be used to decode a JSON document from a string that may 420 have extraneous data at the end. 421 422 """ 423 try: 424 obj, end = self.scan_once(s, idx=_w(s, idx).end()) 425 except StopIteration: 426 raise JSONDecodeError("No JSON object could be decoded", s, idx) 427 return obj, end 428