decoder.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343
  1. """
  2. Implementation of JSONDecoder
  3. """
  4. import re
  5. import sys
  6. from simplejson.scanner import Scanner, pattern
  7. try:
  8. from simplejson._speedups import scanstring as c_scanstring
  9. except ImportError:
  10. pass
# Flags shared by the regular expressions compiled in this module:
# verbose syntax, multi-line anchors, and "." also matching newlines.
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
  12. def _floatconstants():
  13. import struct
  14. import sys
  15. _BYTES = '7FF80000000000007FF0000000000000'.decode('hex')
  16. if sys.byteorder != 'big':
  17. _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
  18. nan, inf = struct.unpack('dd', _BYTES)
  19. return nan, inf, -inf
  20. NaN, PosInf, NegInf = _floatconstants()
  21. def linecol(doc, pos):
  22. lineno = doc.count('\n', 0, pos) + 1
  23. if lineno == 1:
  24. colno = pos
  25. else:
  26. colno = pos - doc.rindex('\n', 0, pos)
  27. return lineno, colno
  28. def errmsg(msg, doc, pos, end=None):
  29. lineno, colno = linecol(doc, pos)
  30. if end is None:
  31. return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos)
  32. endlineno, endcolno = linecol(doc, end)
  33. return '%s: line %d column %d - line %d column %d (char %d - %d)' % (
  34. msg, lineno, colno, endlineno, endcolno, pos, end)
# Default mapping from constant literals to Python values.  It is the
# default lookup table of JSONConstant, used when the decoding context
# supplies no ``parse_constant`` hook.  The first three entries are the
# extensions the JSONDecoder docstring describes as outside the JSON spec.
_CONSTANTS = {
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
    'true': True,
    'false': False,
    'null': None,
}
  43. def JSONConstant(match, context, c=_CONSTANTS):
  44. s = match.group(0)
  45. fn = getattr(context, 'parse_constant', None)
  46. if fn is None:
  47. rval = c[s]
  48. else:
  49. rval = fn(s)
  50. return rval, None
  51. pattern('(-?Infinity|NaN|true|false|null)')(JSONConstant)
  52. def JSONNumber(match, context):
  53. match = JSONNumber.regex.match(match.string, *match.span())
  54. integer, frac, exp = match.groups()
  55. if frac or exp:
  56. fn = getattr(context, 'parse_float', None) or float
  57. res = fn(integer + (frac or '') + (exp or ''))
  58. else:
  59. fn = getattr(context, 'parse_int', None) or int
  60. res = fn(integer)
  61. return res, None
# Register JSONNumber for JSON numbers.  The three groups are the integer
# part (optional minus, no leading zeros), the optional fraction and the
# optional exponent -- the groups JSONNumber unpacks.
pattern(r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?')(JSONNumber)
# Matches one chunk of a string body: group 1 is the (non-greedy) run of
# ordinary characters, group 2 is the character that ended the run -- a
# double quote, a backslash, or a control character in \x00-\x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
# Single-character escapes: the character following a backslash mapped to
# the text it stands for.  ``\uXXXX`` is handled separately in py_scanstring.
BACKSLASH = {
    '"': u'"', '\\': u'\\', '/': u'/',
    'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
}
# Encoding py_scanstring uses for non-unicode input when none is given.
DEFAULT_ENCODING = "utf-8"
  69. def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match):
  70. if encoding is None:
  71. encoding = DEFAULT_ENCODING
  72. chunks = []
  73. _append = chunks.append
  74. begin = end - 1
  75. while 1:
  76. chunk = _m(s, end)
  77. if chunk is None:
  78. raise ValueError(
  79. errmsg("Unterminated string starting at", s, begin))
  80. end = chunk.end()
  81. content, terminator = chunk.groups()
  82. if content:
  83. if not isinstance(content, unicode):
  84. content = unicode(content, encoding)
  85. _append(content)
  86. if terminator == '"':
  87. break
  88. elif terminator != '\\':
  89. if strict:
  90. raise ValueError(errmsg("Invalid control character %r at", s, end))
  91. else:
  92. _append(terminator)
  93. continue
  94. try:
  95. esc = s[end]
  96. except IndexError:
  97. raise ValueError(
  98. errmsg("Unterminated string starting at", s, begin))
  99. if esc != 'u':
  100. try:
  101. m = _b[esc]
  102. except KeyError:
  103. raise ValueError(
  104. errmsg("Invalid \\escape: %r" % (esc,), s, end))
  105. end += 1
  106. else:
  107. esc = s[end + 1:end + 5]
  108. next_end = end + 5
  109. msg = "Invalid \\uXXXX escape"
  110. try:
  111. if len(esc) != 4:
  112. raise ValueError
  113. uni = int(esc, 16)
  114. if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
  115. msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
  116. if not s[end + 5:end + 7] == '\\u':
  117. raise ValueError
  118. esc2 = s[end + 7:end + 11]
  119. if len(esc2) != 4:
  120. raise ValueError
  121. uni2 = int(esc2, 16)
  122. uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
  123. next_end += 6
  124. m = unichr(uni)
  125. except ValueError:
  126. raise ValueError(errmsg(msg, s, end))
  127. end = next_end
  128. _append(m)
  129. return u''.join(chunks), end
# Use speedup
# ``c_scanstring`` is only bound when the guarded import from
# simplejson._speedups at the top of this module succeeded; the NameError
# raised otherwise selects the pure-Python implementation.
try:
    scanstring = c_scanstring
except NameError:
    scanstring = py_scanstring
  135. def JSONString(match, context):
  136. encoding = getattr(context, 'encoding', None)
  137. strict = getattr(context, 'strict', True)
  138. return scanstring(match.string, match.end(), encoding, strict)
  139. pattern(r'"')(JSONString)
# Matches a (possibly empty) run of whitespace.  Because the pattern can
# match the empty string, ``WHITESPACE.match(s, i).end()`` always yields
# the index of the next non-whitespace character at or after ``i``.
WHITESPACE = re.compile(r'\s*', FLAGS)
  141. def JSONObject(match, context, _w=WHITESPACE.match):
  142. pairs = {}
  143. s = match.string
  144. end = _w(s, match.end()).end()
  145. nextchar = s[end:end + 1]
  146. # Trivial empty object
  147. if nextchar == '}':
  148. return pairs, end + 1
  149. if nextchar != '"':
  150. raise ValueError(errmsg("Expecting property name", s, end))
  151. end += 1
  152. encoding = getattr(context, 'encoding', None)
  153. strict = getattr(context, 'strict', True)
  154. iterscan = JSONScanner.iterscan
  155. while True:
  156. key, end = scanstring(s, end, encoding, strict)
  157. end = _w(s, end).end()
  158. if s[end:end + 1] != ':':
  159. raise ValueError(errmsg("Expecting : delimiter", s, end))
  160. end = _w(s, end + 1).end()
  161. try:
  162. value, end = iterscan(s, idx=end, context=context).next()
  163. except StopIteration:
  164. raise ValueError(errmsg("Expecting object", s, end))
  165. pairs[key] = value
  166. end = _w(s, end).end()
  167. nextchar = s[end:end + 1]
  168. end += 1
  169. if nextchar == '}':
  170. break
  171. if nextchar != ',':
  172. raise ValueError(errmsg("Expecting , delimiter", s, end - 1))
  173. end = _w(s, end).end()
  174. nextchar = s[end:end + 1]
  175. end += 1
  176. if nextchar != '"':
  177. raise ValueError(errmsg("Expecting property name", s, end - 1))
  178. object_hook = getattr(context, 'object_hook', None)
  179. if object_hook is not None:
  180. pairs = object_hook(pairs)
  181. return pairs, end
  182. pattern(r'{')(JSONObject)
  183. def JSONArray(match, context, _w=WHITESPACE.match):
  184. values = []
  185. s = match.string
  186. end = _w(s, match.end()).end()
  187. # Look-ahead for trivial empty array
  188. nextchar = s[end:end + 1]
  189. if nextchar == ']':
  190. return values, end + 1
  191. iterscan = JSONScanner.iterscan
  192. while True:
  193. try:
  194. value, end = iterscan(s, idx=end, context=context).next()
  195. except StopIteration:
  196. raise ValueError(errmsg("Expecting object", s, end))
  197. values.append(value)
  198. end = _w(s, end).end()
  199. nextchar = s[end:end + 1]
  200. end += 1
  201. if nextchar == ']':
  202. break
  203. if nextchar != ',':
  204. raise ValueError(errmsg("Expecting , delimiter", s, end))
  205. end = _w(s, end).end()
  206. return values, end
  207. pattern(r'\[')(JSONArray)
# The scanner actions for every kind of JSON value, in the order they are
# handed to Scanner.
ANYTHING = [
    JSONObject,
    JSONArray,
    JSONString,
    JSONConstant,
    JSONNumber,
]
# Module-level scanner that JSONObject and JSONArray use to decode nested
# values.
JSONScanner = Scanner(ANYTHING)
  216. class JSONDecoder(object):
  217. """
  218. Simple JSON <http://json.org> decoder
  219. Performs the following translations in decoding by default:
  220. +---------------+-------------------+
  221. | JSON | Python |
  222. +===============+===================+
  223. | object | dict |
  224. +---------------+-------------------+
  225. | array | list |
  226. +---------------+-------------------+
  227. | string | unicode |
  228. +---------------+-------------------+
  229. | number (int) | int, long |
  230. +---------------+-------------------+
  231. | number (real) | float |
  232. +---------------+-------------------+
  233. | true | True |
  234. +---------------+-------------------+
  235. | false | False |
  236. +---------------+-------------------+
  237. | null | None |
  238. +---------------+-------------------+
  239. It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as
  240. their corresponding ``float`` values, which is outside the JSON spec.
  241. """
  242. _scanner = Scanner(ANYTHING)
  243. __all__ = ['__init__', 'decode', 'raw_decode']
  244. def __init__(self, encoding=None, object_hook=None, parse_float=None,
  245. parse_int=None, parse_constant=None, strict=True):
  246. """
  247. ``encoding`` determines the encoding used to interpret any ``str``
  248. objects decoded by this instance (utf-8 by default). It has no
  249. effect when decoding ``unicode`` objects.
  250. Note that currently only encodings that are a superset of ASCII work,
  251. strings of other encodings should be passed in as ``unicode``.
  252. ``object_hook``, if specified, will be called with the result
  253. of every JSON object decoded and its return value will be used in
  254. place of the given ``dict``. This can be used to provide custom
  255. deserializations (e.g. to support JSON-RPC class hinting).
  256. ``parse_float``, if specified, will be called with the string
  257. of every JSON float to be decoded. By default this is equivalent to
  258. float(num_str). This can be used to use another datatype or parser
  259. for JSON floats (e.g. decimal.Decimal).
  260. ``parse_int``, if specified, will be called with the string
  261. of every JSON int to be decoded. By default this is equivalent to
  262. int(num_str). This can be used to use another datatype or parser
  263. for JSON integers (e.g. float).
  264. ``parse_constant``, if specified, will be called with one of the
  265. following strings: -Infinity, Infinity, NaN, null, true, false.
  266. This can be used to raise an exception if invalid JSON numbers
  267. are encountered.
  268. """
  269. self.encoding = encoding
  270. self.object_hook = object_hook
  271. self.parse_float = parse_float
  272. self.parse_int = parse_int
  273. self.parse_constant = parse_constant
  274. self.strict = strict
  275. def decode(self, s, _w=WHITESPACE.match):
  276. """
  277. Return the Python representation of ``s`` (a ``str`` or ``unicode``
  278. instance containing a JSON document)
  279. """
  280. obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  281. end = _w(s, end).end()
  282. if end != len(s):
  283. raise ValueError(errmsg("Extra data", s, end, len(s)))
  284. return obj
  285. def raw_decode(self, s, **kw):
  286. """
  287. Decode a JSON document from ``s`` (a ``str`` or ``unicode`` beginning
  288. with a JSON document) and return a 2-tuple of the Python
  289. representation and the index in ``s`` where the document ended.
  290. This can be used to decode a JSON document from a string that may
  291. have extraneous data at the end.
  292. """
  293. kw.setdefault('context', self)
  294. try:
  295. obj, end = self._scanner.iterscan(s, **kw).next()
  296. except StopIteration:
  297. raise ValueError("No JSON object could be decoded")
  298. return obj, end
# Names exported by ``from <this module> import *``.
__all__ = ['JSONDecoder']