"""Implementation of JSONDecoder """ import re import sys import struct from simplejson.scanner import make_scanner def _import_c_scanstring(): try: from simplejson._speedups import scanstring return scanstring except ImportError: return None c_scanstring = _import_c_scanstring() __all__ = ['JSONDecoder'] FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL def _floatconstants(): _BYTES = '7FF80000000000007FF0000000000000'.decode('hex') # The struct module in Python 2.4 would get frexp() out of range here # when an endian is specified in the format string. Fixed in Python 2.5+ if sys.byteorder != 'big': _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1] nan, inf = struct.unpack('dd', _BYTES) return nan, inf, -inf NaN, PosInf, NegInf = _floatconstants() class JSONDecodeError(ValueError): """Subclass of ValueError with the following additional properties: msg: The unformatted error message doc: The JSON document being parsed pos: The start index of doc where parsing failed end: The end index of doc where parsing failed (may be None) lineno: The line corresponding to pos colno: The column corresponding to pos endlineno: The line corresponding to end (may be None) endcolno: The column corresponding to end (may be None) """ def __init__(self, msg, doc, pos, end=None): ValueError.__init__(self, errmsg(msg, doc, pos, end=end)) self.msg = msg self.doc = doc self.pos = pos self.end = end self.lineno, self.colno = linecol(doc, pos) if end is not None: self.endlineno, self.endcolno = linecol(doc, end) else: self.endlineno, self.endcolno = None, None def linecol(doc, pos): lineno = doc.count('\n', 0, pos) + 1 if lineno == 1: colno = pos else: colno = pos - doc.rindex('\n', 0, pos) return lineno, colno def errmsg(msg, doc, pos, end=None): # Note that this function is called from _speedups lineno, colno = linecol(doc, pos) if end is None: #fmt = '{0}: line {1} column {2} (char {3})' #return fmt.format(msg, lineno, colno, pos) fmt = '%s: line %d column %d (char %d)' return fmt % (msg, lineno, colno, pos) endlineno, endcolno = linecol(doc, end) #fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})' #return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end) fmt = '%s: line %d column %d - line %d column %d (char %d - %d)' return fmt % (msg, lineno, colno, endlineno, endcolno, pos, end) _CONSTANTS = { '-Infinity': NegInf, 'Infinity': PosInf, 'NaN': NaN, } STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS) BACKSLASH = { '"': u'"', '\\': u'\\', '/': u'/', 'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t', } DEFAULT_ENCODING = "utf-8" def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match): """Scan the string s for a JSON string. End is the index of the character in s after the quote that started the JSON string. Unescapes all valid JSON string escape sequences and raises ValueError on attempt to decode an invalid string. If strict is False then literal control characters are allowed in the string. Returns a tuple of the decoded string and the index of the character in s after the end quote.""" if encoding is None: encoding = DEFAULT_ENCODING chunks = [] _append = chunks.append begin = end - 1 while 1: chunk = _m(s, end) if chunk is None: raise JSONDecodeError( "Unterminated string starting at", s, begin) end = chunk.end() content, terminator = chunk.groups() # Content is contains zero or more unescaped string characters if content: if not isinstance(content, unicode): content = unicode(content, encoding) _append(content) # Terminator is the end of string, a literal control character, # or a backslash denoting that an escape sequence follows if terminator == '"': break elif terminator != '\\': if strict: msg = "Invalid control character %r at" % (terminator,) #msg = "Invalid control character {0!r} at".format(terminator) raise JSONDecodeError(msg, s, end) else: _append(terminator) continue try: esc = s[end] except IndexError: raise JSONDecodeError( "Unterminated string starting at", s, begin) # If not a unicode escape sequence, must be in the lookup table if esc != 'u': try: char = _b[esc] except KeyError: msg = "Invalid \\escape: " + repr(esc) raise JSONDecodeError(msg, s, end) end += 1 else: # Unicode escape sequence esc = s[end + 1:end + 5] next_end = end + 5 if len(esc) != 4: msg = "Invalid \\uXXXX escape" raise JSONDecodeError(msg, s, end) uni = int(esc, 16) # Check for surrogate pair on UCS-4 systems if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535: msg = "Invalid \\uXXXX\\uXXXX surrogate pair" if not s[end + 5:end + 7] == '\\u': raise JSONDecodeError(msg, s, end) esc2 = s[end + 7:end + 11] if len(esc2) != 4: raise JSONDecodeError(msg, s, end) uni2 = int(esc2, 16) uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) next_end += 6 char = unichr(uni) end = next_end # Append the unescaped character _append(char) return u''.join(chunks), end # Use speedup if available scanstring = c_scanstring or py_scanstring WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS) WHITESPACE_STR = ' \t\n\r' def JSONObject((s, end), encoding, strict, scan_once, object_hook, object_pairs_hook, memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR): # Backwards compatibility if memo is None: memo = {} memo_get = memo.setdefault pairs = [] # Use a slice to prevent IndexError from being raised, the following # check will raise a more specific ValueError if the string is empty nextchar = s[end:end + 1] # Normally we expect nextchar == '"' if nextchar != '"': if nextchar in _ws: end = _w(s, end).end() nextchar = s[end:end + 1] # Trivial empty object if nextchar == '}': if object_pairs_hook is not None: result = object_pairs_hook(pairs) return result, end + 1 pairs = {} if object_hook is not None: pairs = object_hook(pairs) return pairs, end + 1 elif nextchar != '"': raise JSONDecodeError( "Expecting property name enclosed in double quotes", s, end) end += 1 while True: key, end = scanstring(s, end, encoding, strict) key = memo_get(key, key) # To skip some function call overhead we optimize the fast paths where # the JSON key separator is ": " or just ":". if s[end:end + 1] != ':': end = _w(s, end).end() if s[end:end + 1] != ':': raise JSONDecodeError("Expecting ':' delimiter", s, end) end += 1 try: if s[end] in _ws: end += 1 if s[end] in _ws: end = _w(s, end + 1).end() except IndexError: pass try: value, end = scan_once(s, end) except StopIteration: raise JSONDecodeError("Expecting object", s, end) pairs.append((key, value)) try: nextchar = s[end] if nextchar in _ws: end = _w(s, end + 1).end() nextchar = s[end] except IndexError: nextchar = '' end += 1 if nextchar == '}': break elif nextchar != ',': raise JSONDecodeError("Expecting ',' delimiter", s, end - 1) try: nextchar = s[end] if nextchar in _ws: end += 1 nextchar = s[end] if nextchar in _ws: end = _w(s, end + 1).end() nextchar = s[end] except IndexError: nextchar = '' end += 1 if nextchar != '"': raise JSONDecodeError( "Expecting property name enclosed in double quotes", s, end - 1) if object_pairs_hook is not None: result = object_pairs_hook(pairs) return result, end pairs = dict(pairs) if object_hook is not None: pairs = object_hook(pairs) return pairs, end def JSONArray((s, end), scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR): values = [] nextchar = s[end:end + 1] if nextchar in _ws: end = _w(s, end + 1).end() nextchar = s[end:end + 1] # Look-ahead for trivial empty array if nextchar == ']': return values, end + 1 _append = values.append while True: try: value, end = scan_once(s, end) except StopIteration: raise JSONDecodeError("Expecting object", s, end) _append(value) nextchar = s[end:end + 1] if nextchar in _ws: end = _w(s, end + 1).end() nextchar = s[end:end + 1] end += 1 if nextchar == ']': break elif nextchar != ',': raise JSONDecodeError("Expecting ',' delimiter", s, end) try: if s[end] in _ws: end += 1 if s[end] in _ws: end = _w(s, end + 1).end() except IndexError: pass return values, end class JSONDecoder(object): """Simple JSON <http://json.org> decoder Performs the following translations in decoding by default: +---------------+-------------------+ | JSON | Python | +===============+===================+ | object | dict | +---------------+-------------------+ | array | list | +---------------+-------------------+ | string | unicode | +---------------+-------------------+ | number (int) | int, long | +---------------+-------------------+ | number (real) | float | +---------------+-------------------+ | true | True | +---------------+-------------------+ | false | False | +---------------+-------------------+ | null | None | +---------------+-------------------+ It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as their corresponding ``float`` values, which is outside the JSON spec. """ def __init__(self, encoding=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, strict=True, object_pairs_hook=None): """ *encoding* determines the encoding used to interpret any :class:`str` objects decoded by this instance (``'utf-8'`` by default). It has no effect when decoding :class:`unicode` objects. Note that currently only encodings that are a superset of ASCII work, strings of other encodings should be passed in as :class:`unicode`. *object_hook*, if specified, will be called with the result of every JSON object decoded and its return value will be used in place of the given :class:`dict`. This can be used to provide custom deserializations (e.g. to support JSON-RPC class hinting). *object_pairs_hook* is an optional function that will be called with the result of any object literal decode with an ordered list of pairs. The return value of *object_pairs_hook* will be used instead of the :class:`dict`. This feature can be used to implement custom decoders that rely on the order that the key and value pairs are decoded (for example, :func:`collections.OrderedDict` will remember the order of insertion). If *object_hook* is also defined, the *object_pairs_hook* takes priority. *parse_float*, if specified, will be called with the string of every JSON float to be decoded. By default, this is equivalent to ``float(num_str)``. This can be used to use another datatype or parser for JSON floats (e.g. :class:`decimal.Decimal`). *parse_int*, if specified, will be called with the string of every JSON int to be decoded. By default, this is equivalent to ``int(num_str)``. This can be used to use another datatype or parser for JSON integers (e.g. :class:`float`). *parse_constant*, if specified, will be called with one of the following strings: ``'-Infinity'``, ``'Infinity'``, ``'NaN'``. This can be used to raise an exception if invalid JSON numbers are encountered. *strict* controls the parser's behavior when it encounters an invalid control character in a string. The default setting of ``True`` means that unescaped control characters are parse errors, if ``False`` then control characters will be allowed in strings. """ self.encoding = encoding self.object_hook = object_hook self.object_pairs_hook = object_pairs_hook self.parse_float = parse_float or float self.parse_int = parse_int or int self.parse_constant = parse_constant or _CONSTANTS.__getitem__ self.strict = strict self.parse_object = JSONObject self.parse_array = JSONArray self.parse_string = scanstring self.memo = {} self.scan_once = make_scanner(self) def decode(self, s, _w=WHITESPACE.match): """Return the Python representation of ``s`` (a ``str`` or ``unicode`` instance containing a JSON document) """ obj, end = self.raw_decode(s) end = _w(s, end).end() if end != len(s): raise JSONDecodeError("Extra data", s, end, len(s)) return obj def raw_decode(self, s, idx=0, _w=WHITESPACE.match): """Decode a JSON document from ``s`` (a ``str`` or ``unicode`` beginning with a JSON document) and return a 2-tuple of the Python representation and the index in ``s`` where the document ended. Optionally, ``idx`` can be used to specify an offset in ``s`` where the JSON document begins. This can be used to decode a JSON document from a string that may have extraneous data at the end. """ try: obj, end = self.scan_once(s, idx=_w(s, idx).end()) except StopIteration: raise JSONDecodeError("No JSON object could be decoded", s, idx) return obj, end