Home | History | Annotate | Download | only in json
      1 """Implementation of JSONEncoder
      2 """
      3 import re
      4 
      5 try:
      6     from _json import encode_basestring_ascii as c_encode_basestring_ascii
      7 except ImportError:
      8     c_encode_basestring_ascii = None
      9 try:
     10     from _json import make_encoder as c_make_encoder
     11 except ImportError:
     12     c_make_encoder = None
     13 
     14 ESCAPE = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t]')
     15 ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])')
     16 HAS_UTF8 = re.compile(r'[\x80-\xff]')
     17 ESCAPE_DCT = {
     18     '\\': '\\\\',
     19     '"': '\\"',
     20     '\b': '\\b',
     21     '\f': '\\f',
     22     '\n': '\\n',
     23     '\r': '\\r',
     24     '\t': '\\t',
     25 }
     26 for i in range(0x20):
     27     ESCAPE_DCT.setdefault(chr(i), '\\u{0:04x}'.format(i))
     28     #ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,))
     29 
     30 INFINITY = float('inf')
     31 FLOAT_REPR = float.__repr__
     32 
     33 def encode_basestring(s):
     34     """Return a JSON representation of a Python string
     35 
     36     """
     37     def replace(match):
     38         return ESCAPE_DCT[match.group(0)]
     39     return '"' + ESCAPE.sub(replace, s) + '"'
     40 
     41 
     42 def py_encode_basestring_ascii(s):
     43     """Return an ASCII-only JSON representation of a Python string
     44 
     45     """
     46     if isinstance(s, str) and HAS_UTF8.search(s) is not None:
     47         s = s.decode('utf-8')
     48     def replace(match):
     49         s = match.group(0)
     50         try:
     51             return ESCAPE_DCT[s]
     52         except KeyError:
     53             n = ord(s)
     54             if n < 0x10000:
     55                 return '\\u{0:04x}'.format(n)
     56                 #return '\\u%04x' % (n,)
     57             else:
     58                 # surrogate pair
     59                 n -= 0x10000
     60                 s1 = 0xd800 | ((n >> 10) & 0x3ff)
     61                 s2 = 0xdc00 | (n & 0x3ff)
     62                 return '\\u{0:04x}\\u{1:04x}'.format(s1, s2)
     63                 #return '\\u%04x\\u%04x' % (s1, s2)
     64     return '"' + str(ESCAPE_ASCII.sub(replace, s)) + '"'
     65 
     66 
     67 encode_basestring_ascii = (
     68     c_encode_basestring_ascii or py_encode_basestring_ascii)
     69 
     70 class JSONEncoder(object):
     71     """Extensible JSON <http://json.org> encoder for Python data structures.
     72 
     73     Supports the following objects and types by default:
     74 
     75     +-------------------+---------------+
     76     | Python            | JSON          |
     77     +===================+===============+
     78     | dict              | object        |
     79     +-------------------+---------------+
     80     | list, tuple       | array         |
     81     +-------------------+---------------+
     82     | str, unicode      | string        |
     83     +-------------------+---------------+
     84     | int, long, float  | number        |
     85     +-------------------+---------------+
     86     | True              | true          |
     87     +-------------------+---------------+
     88     | False             | false         |
     89     +-------------------+---------------+
     90     | None              | null          |
     91     +-------------------+---------------+
     92 
     93     To extend this to recognize other objects, subclass and implement a
     94     ``.default()`` method with another method that returns a serializable
     95     object for ``o`` if possible, otherwise it should call the superclass
     96     implementation (to raise ``TypeError``).
     97 
     98     """
     99     item_separator = ', '
    100     key_separator = ': '
    101     def __init__(self, skipkeys=False, ensure_ascii=True,
    102             check_circular=True, allow_nan=True, sort_keys=False,
    103             indent=None, separators=None, encoding='utf-8', default=None):
    104         """Constructor for JSONEncoder, with sensible defaults.
    105 
    106         If skipkeys is false, then it is a TypeError to attempt
    107         encoding of keys that are not str, int, long, float or None.  If
    108         skipkeys is True, such items are simply skipped.
    109 
    110         If *ensure_ascii* is true (the default), all non-ASCII
    111         characters in the output are escaped with \uXXXX sequences,
    112         and the results are str instances consisting of ASCII
    113         characters only.  If ensure_ascii is False, a result may be a
    114         unicode instance.  This usually happens if the input contains
    115         unicode strings or the *encoding* parameter is used.
    116 
    117         If check_circular is true, then lists, dicts, and custom encoded
    118         objects will be checked for circular references during encoding to
    119         prevent an infinite recursion (which would cause an OverflowError).
    120         Otherwise, no such check takes place.
    121 
    122         If allow_nan is true, then NaN, Infinity, and -Infinity will be
    123         encoded as such.  This behavior is not JSON specification compliant,
    124         but is consistent with most JavaScript based encoders and decoders.
    125         Otherwise, it will be a ValueError to encode such floats.
    126 
    127         If sort_keys is true, then the output of dictionaries will be
    128         sorted by key; this is useful for regression tests to ensure
    129         that JSON serializations can be compared on a day-to-day basis.
    130 
    131         If indent is a non-negative integer, then JSON array
    132         elements and object members will be pretty-printed with that
    133         indent level.  An indent level of 0 will only insert newlines.
    134         None is the most compact representation.  Since the default
    135         item separator is ', ',  the output might include trailing
    136         whitespace when indent is specified.  You can use
    137         separators=(',', ': ') to avoid this.
    138 
    139         If specified, separators should be a (item_separator, key_separator)
    140         tuple.  The default is (', ', ': ').  To get the most compact JSON
    141         representation you should specify (',', ':') to eliminate whitespace.
    142 
    143         If specified, default is a function that gets called for objects
    144         that can't otherwise be serialized.  It should return a JSON encodable
    145         version of the object or raise a ``TypeError``.
    146 
    147         If encoding is not None, then all input strings will be
    148         transformed into unicode using that encoding prior to JSON-encoding.
    149         The default is UTF-8.
    150 
    151         """
    152 
    153         self.skipkeys = skipkeys
    154         self.ensure_ascii = ensure_ascii
    155         self.check_circular = check_circular
    156         self.allow_nan = allow_nan
    157         self.sort_keys = sort_keys
    158         self.indent = indent
    159         if separators is not None:
    160             self.item_separator, self.key_separator = separators
    161         if default is not None:
    162             self.default = default
    163         self.encoding = encoding
    164 
    165     def default(self, o):
    166         """Implement this method in a subclass such that it returns
    167         a serializable object for ``o``, or calls the base implementation
    168         (to raise a ``TypeError``).
    169 
    170         For example, to support arbitrary iterators, you could
    171         implement default like this::
    172 
    173             def default(self, o):
    174                 try:
    175                     iterable = iter(o)
    176                 except TypeError:
    177                     pass
    178                 else:
    179                     return list(iterable)
    180                 # Let the base class default method raise the TypeError
    181                 return JSONEncoder.default(self, o)
    182 
    183         """
    184         raise TypeError(repr(o) + " is not JSON serializable")
    185 
    186     def encode(self, o):
    187         """Return a JSON string representation of a Python data structure.
    188 
    189         >>> JSONEncoder().encode({"foo": ["bar", "baz"]})
    190         '{"foo": ["bar", "baz"]}'
    191 
    192         """
    193         # This is for extremely simple cases and benchmarks.
    194         if isinstance(o, basestring):
    195             if isinstance(o, str):
    196                 _encoding = self.encoding
    197                 if (_encoding is not None
    198                         and not (_encoding == 'utf-8')):
    199                     o = o.decode(_encoding)
    200             if self.ensure_ascii:
    201                 return encode_basestring_ascii(o)
    202             else:
    203                 return encode_basestring(o)
    204         # This doesn't pass the iterator directly to ''.join() because the
    205         # exceptions aren't as detailed.  The list call should be roughly
    206         # equivalent to the PySequence_Fast that ''.join() would do.
    207         chunks = self.iterencode(o, _one_shot=True)
    208         if not isinstance(chunks, (list, tuple)):
    209             chunks = list(chunks)
    210         return ''.join(chunks)
    211 
    212     def iterencode(self, o, _one_shot=False):
    213         """Encode the given object and yield each string
    214         representation as available.
    215 
    216         For example::
    217 
    218             for chunk in JSONEncoder().iterencode(bigobject):
    219                 mysocket.write(chunk)
    220 
    221         """
    222         if self.check_circular:
    223             markers = {}
    224         else:
    225             markers = None
    226         if self.ensure_ascii:
    227             _encoder = encode_basestring_ascii
    228         else:
    229             _encoder = encode_basestring
    230         if self.encoding != 'utf-8':
    231             def _encoder(o, _orig_encoder=_encoder, _encoding=self.encoding):
    232                 if isinstance(o, str):
    233                     o = o.decode(_encoding)
    234                 return _orig_encoder(o)
    235 
    236         def floatstr(o, allow_nan=self.allow_nan,
    237                 _repr=FLOAT_REPR, _inf=INFINITY, _neginf=-INFINITY):
    238             # Check for specials.  Note that this type of test is processor
    239             # and/or platform-specific, so do tests which don't depend on the
    240             # internals.
    241 
    242             if o != o:
    243                 text = 'NaN'
    244             elif o == _inf:
    245                 text = 'Infinity'
    246             elif o == _neginf:
    247                 text = '-Infinity'
    248             else:
    249                 return _repr(o)
    250 
    251             if not allow_nan:
    252                 raise ValueError(
    253                     "Out of range float values are not JSON compliant: " +
    254                     repr(o))
    255 
    256             return text
    257 
    258 
    259         if (_one_shot and c_make_encoder is not None
    260                 and self.indent is None and not self.sort_keys):
    261             _iterencode = c_make_encoder(
    262                 markers, self.default, _encoder, self.indent,
    263                 self.key_separator, self.item_separator, self.sort_keys,
    264                 self.skipkeys, self.allow_nan)
    265         else:
    266             _iterencode = _make_iterencode(
    267                 markers, self.default, _encoder, self.indent, floatstr,
    268                 self.key_separator, self.item_separator, self.sort_keys,
    269                 self.skipkeys, _one_shot)
    270         return _iterencode(o, 0)
    271 
    272 def _make_iterencode(markers, _default, _encoder, _indent, _floatstr,
    273         _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot,
    274         ## HACK: hand-optimized bytecode; turn globals into locals
    275         ValueError=ValueError,
    276         basestring=basestring,
    277         dict=dict,
    278         float=float,
    279         id=id,
    280         int=int,
    281         isinstance=isinstance,
    282         list=list,
    283         long=long,
    284         str=str,
    285         tuple=tuple,
    286     ):
    287 
    288     def _iterencode_list(lst, _current_indent_level):
    289         if not lst:
    290             yield '[]'
    291             return
    292         if markers is not None:
    293             markerid = id(lst)
    294             if markerid in markers:
    295                 raise ValueError("Circular reference detected")
    296             markers[markerid] = lst
    297         buf = '['
    298         if _indent is not None:
    299             _current_indent_level += 1
    300             newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
    301             separator = _item_separator + newline_indent
    302             buf += newline_indent
    303         else:
    304             newline_indent = None
    305             separator = _item_separator
    306         first = True
    307         for value in lst:
    308             if first:
    309                 first = False
    310             else:
    311                 buf = separator
    312             if isinstance(value, basestring):
    313                 yield buf + _encoder(value)
    314             elif value is None:
    315                 yield buf + 'null'
    316             elif value is True:
    317                 yield buf + 'true'
    318             elif value is False:
    319                 yield buf + 'false'
    320             elif isinstance(value, (int, long)):
    321                 yield buf + str(value)
    322             elif isinstance(value, float):
    323                 yield buf + _floatstr(value)
    324             else:
    325                 yield buf
    326                 if isinstance(value, (list, tuple)):
    327                     chunks = _iterencode_list(value, _current_indent_level)
    328                 elif isinstance(value, dict):
    329                     chunks = _iterencode_dict(value, _current_indent_level)
    330                 else:
    331                     chunks = _iterencode(value, _current_indent_level)
    332                 for chunk in chunks:
    333                     yield chunk
    334         if newline_indent is not None:
    335             _current_indent_level -= 1
    336             yield '\n' + (' ' * (_indent * _current_indent_level))
    337         yield ']'
    338         if markers is not None:
    339             del markers[markerid]
    340 
    341     def _iterencode_dict(dct, _current_indent_level):
    342         if not dct:
    343             yield '{}'
    344             return
    345         if markers is not None:
    346             markerid = id(dct)
    347             if markerid in markers:
    348                 raise ValueError("Circular reference detected")
    349             markers[markerid] = dct
    350         yield '{'
    351         if _indent is not None:
    352             _current_indent_level += 1
    353             newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
    354             item_separator = _item_separator + newline_indent
    355             yield newline_indent
    356         else:
    357             newline_indent = None
    358             item_separator = _item_separator
    359         first = True
    360         if _sort_keys:
    361             items = sorted(dct.items(), key=lambda kv: kv[0])
    362         else:
    363             items = dct.iteritems()
    364         for key, value in items:
    365             if isinstance(key, basestring):
    366                 pass
    367             # JavaScript is weakly typed for these, so it makes sense to
    368             # also allow them.  Many encoders seem to do something like this.
    369             elif isinstance(key, float):
    370                 key = _floatstr(key)
    371             elif key is True:
    372                 key = 'true'
    373             elif key is False:
    374                 key = 'false'
    375             elif key is None:
    376                 key = 'null'
    377             elif isinstance(key, (int, long)):
    378                 key = str(key)
    379             elif _skipkeys:
    380                 continue
    381             else:
    382                 raise TypeError("key " + repr(key) + " is not a string")
    383             if first:
    384                 first = False
    385             else:
    386                 yield item_separator
    387             yield _encoder(key)
    388             yield _key_separator
    389             if isinstance(value, basestring):
    390                 yield _encoder(value)
    391             elif value is None:
    392                 yield 'null'
    393             elif value is True:
    394                 yield 'true'
    395             elif value is False:
    396                 yield 'false'
    397             elif isinstance(value, (int, long)):
    398                 yield str(value)
    399             elif isinstance(value, float):
    400                 yield _floatstr(value)
    401             else:
    402                 if isinstance(value, (list, tuple)):
    403                     chunks = _iterencode_list(value, _current_indent_level)
    404                 elif isinstance(value, dict):
    405                     chunks = _iterencode_dict(value, _current_indent_level)
    406                 else:
    407                     chunks = _iterencode(value, _current_indent_level)
    408                 for chunk in chunks:
    409                     yield chunk
    410         if newline_indent is not None:
    411             _current_indent_level -= 1
    412             yield '\n' + (' ' * (_indent * _current_indent_level))
    413         yield '}'
    414         if markers is not None:
    415             del markers[markerid]
    416 
    417     def _iterencode(o, _current_indent_level):
    418         if isinstance(o, basestring):
    419             yield _encoder(o)
    420         elif o is None:
    421             yield 'null'
    422         elif o is True:
    423             yield 'true'
    424         elif o is False:
    425             yield 'false'
    426         elif isinstance(o, (int, long)):
    427             yield str(o)
    428         elif isinstance(o, float):
    429             yield _floatstr(o)
    430         elif isinstance(o, (list, tuple)):
    431             for chunk in _iterencode_list(o, _current_indent_level):
    432                 yield chunk
    433         elif isinstance(o, dict):
    434             for chunk in _iterencode_dict(o, _current_indent_level):
    435                 yield chunk
    436         else:
    437             if markers is not None:
    438                 markerid = id(o)
    439                 if markerid in markers:
    440                     raise ValueError("Circular reference detected")
    441                 markers[markerid] = o
    442             o = _default(o)
    443             for chunk in _iterencode(o, _current_indent_level):
    444                 yield chunk
    445             if markers is not None:
    446                 del markers[markerid]
    447 
    448     return _iterencode
    449