[pypy-commit] pypy json-decoder-maps-py3.6: 3.6 version of json-decoder-maps
cfbolz
pypy.commits at gmail.com
Tue Jun 11 07:21:34 EDT 2019
Author: Carl Friedrich Bolz-Tereick <cfbolz at gmx.de>
Branch: json-decoder-maps-py3.6
Changeset: r96790:5bea5a7b9381
Date: 2019-06-11 13:20 +0200
http://bitbucket.org/pypy/pypy/changeset/5bea5a7b9381/
Log: 3.6 version of json-decoder-maps
diff too long, truncating to 2000 out of 2070 lines
diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -49,3 +49,9 @@
Instead, replace it in ``rewrite.py`` with a direct call to ``memcpy()`` and
new basic operation, ``load_effective_address``, which the backend can
even decide not to implement.
+
+.. branch: json-decoder-maps
+
+Much faster and more memory-efficient JSON decoding. The resulting
+dictionaries that come out of the JSON decoder have faster lookups too.
+
diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -1,11 +1,13 @@
import sys
from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.objectmodel import specialize, always_inline, r_dict
-from rpython.rlib import rfloat, rutf8
+from rpython.rlib.objectmodel import specialize, always_inline
+from rpython.rlib import rfloat, runicode, jit, objectmodel, rutf8
from rpython.rtyper.lltypesystem import lltype, rffi
from pypy.interpreter.error import oefmt, OperationError
from rpython.rlib.rarithmetic import r_uint
from pypy.interpreter import unicodehelper
+from pypy.interpreter.baseobjspace import W_Root
+from pypy.module._pypyjson import simd
OVF_DIGITS = len(str(sys.maxint))
@@ -15,50 +17,97 @@
# precomputing negative powers of 10 is MUCH faster than using e.g. math.pow
# at runtime
NEG_POW_10 = [10.0**-i for i in range(16)]
+del i
+
def neg_pow_10(x, exp):
if exp >= len(NEG_POW_10):
return 0.0
return x * NEG_POW_10[exp]
-def slice_eq(a, b):
- (ll_chars1, start1, length1, _) = a
- (ll_chars2, start2, length2, _) = b
- if length1 != length2:
- return False
- j = start2
- for i in range(start1, start1 + length1):
- if ll_chars1[i] != ll_chars2[j]:
- return False
- j += 1
- return True
-def slice_hash(a):
- (ll_chars, start, length, h) = a
- return h
+class IntCache(object):
+ """ A cache for wrapped ints between START and END """
+
+ START = -10
+ END = 256
+
+ def __init__(self, space):
+ self.space = space
+ self.cache = [self.space.newint(i)
+ for i in range(self.START, self.END)]
+
+ def newint(self, intval):
+ if self.START <= intval < self.END:
+ return self.cache[intval - self.START]
+ return self.space.newint(intval)
class DecoderError(Exception):
def __init__(self, msg, pos):
self.msg = msg
self.pos = pos
-TYPE_UNKNOWN = 0
-TYPE_STRING = 1
-class JSONDecoder(object):
+class JSONDecoder(W_Root):
+
+ LRU_SIZE = 16
+ LRU_MASK = LRU_SIZE - 1
+
+ DEFAULT_SIZE_SCRATCH = 20
+
+ MIN_SIZE_FOR_STRING_CACHE = 1024 * 1024
+
+ # evaluate the string cache for 200 strings, before looking at the hit rate
+ # and deciding whether to keep doing it
+ STRING_CACHE_EVALUATION_SIZE = 200
+
+ # keep using the string cache if at least 25% of all decoded strings are a
+ # hit in the cache
+ STRING_CACHE_USEFULNESS_FACTOR = 4
+
+
def __init__(self, space, s):
self.space = space
+ self.w_empty_string = space.newutf8("", 0)
+
self.s = s
+
# we put our string in a raw buffer so:
# 1) we automatically get the '\0' sentinel at the end of the string,
# which means that we never have to check for the "end of string"
# 2) we can pass the buffer directly to strtod
- self.ll_chars = rffi.str2charp(s)
+ self.ll_chars, self.flag = rffi.get_nonmovingbuffer_final_null(self.s)
self.end_ptr = lltype.malloc(rffi.CCHARPP.TO, 1, flavor='raw')
self.pos = 0
- self.cache = r_dict(slice_eq, slice_hash, simple_hash_eq=True)
+ self.intcache = space.fromcache(IntCache)
+
+ # two caches, one for keys, one for general strings. they both have the
+ # form {hash-as-int: CacheEntry} and they don't deal with
+ # collisions at all. For every hash there is simply one string stored.
+ self.cache = {}
+ self.cache_wrapped = {}
+
+ self.lru_cache = [0] * self.LRU_SIZE
+ self.lru_index = 0
+
+ self.startmap = self.space.fromcache(Terminator)
+
+ # keep a list of objects that are created with maps that aren't clearly
+ # useful. If they turn out to be useful in the end we are good,
+ # otherwise convert them to dicts (see .close())
+ self.unclear_objects = []
+
+ # this is a freelist of lists that store the decoded value of an
+ # object, before they get copied into the eventual dict
+ self.scratch = [[None] * self.DEFAULT_SIZE_SCRATCH]
+
def close(self):
- rffi.free_charp(self.ll_chars)
+ rffi.free_nonmovingbuffer(self.s, self.ll_chars, self.flag)
lltype.free(self.end_ptr, flavor='raw')
+ # clean up objects that are instances of now blocked maps
+ for w_obj in self.unclear_objects:
+ jsonmap = self._get_jsonmap_from_dict(w_obj)
+ if jsonmap.is_state_blocked():
+ self._devolve_jsonmap_dict(w_obj)
def getslice(self, start, end):
assert start >= 0
@@ -66,19 +115,22 @@
return self.s[start:end]
def skip_whitespace(self, i):
+ ll_chars = self.ll_chars
while True:
- ch = self.ll_chars[i]
+ ch = ll_chars[i]
if is_whitespace(ch):
- i+=1
+ i += 1
else:
break
return i
- def decode_any(self, i):
+ def decode_any(self, i, contextmap=None):
+ """ Decode an object at position i. Optionally pass a contextmap, if
+ the value is decoded as the value of a dict. """
i = self.skip_whitespace(i)
ch = self.ll_chars[i]
if ch == '"':
- return self.decode_string(i+1)
+ return self.decode_string(i+1, contextmap)
elif ch == '[':
return self.decode_array(i+1)
elif ch == '{':
@@ -100,7 +152,11 @@
elif ch.isdigit():
return self.decode_numeric(i)
else:
- raise DecoderError("Unexpected '%s' at" % ch, i)
+ raise DecoderError("Unexpected '%s'" % ch, i)
+
+
+ def _raise(self, msg, pos):
+ raise DecoderError(msg, pos)
def decode_null(self, i):
if (self.ll_chars[i] == 'u' and
@@ -108,7 +164,7 @@
self.ll_chars[i+2] == 'l'):
self.pos = i+3
return self.space.w_None
- raise DecoderError("Error when decoding null at", i)
+ raise DecoderError("Error when decoding null", i)
def decode_true(self, i):
if (self.ll_chars[i] == 'r' and
@@ -116,7 +172,7 @@
self.ll_chars[i+2] == 'e'):
self.pos = i+3
return self.space.w_True
- raise DecoderError("Error when decoding true at", i)
+ raise DecoderError("Error when decoding true", i)
def decode_false(self, i):
if (self.ll_chars[i] == 'a' and
@@ -125,7 +181,7 @@
self.ll_chars[i+3] == 'e'):
self.pos = i+4
return self.space.w_False
- raise DecoderError("Error when decoding false at", i)
+ raise DecoderError("Error when decoding false", i)
def decode_infinity(self, i, sign=1):
if (self.ll_chars[i] == 'n' and
@@ -137,14 +193,14 @@
self.ll_chars[i+6] == 'y'):
self.pos = i+7
return self.space.newfloat(rfloat.INFINITY * sign)
- raise DecoderError("Error when decoding Infinity at", i)
+ raise DecoderError("Error when decoding Infinity", i)
def decode_nan(self, i):
if (self.ll_chars[i] == 'a' and
self.ll_chars[i+1] == 'N'):
self.pos = i+2
return self.space.newfloat(rfloat.NAN)
- raise DecoderError("Error when decoding NaN at", i)
+ raise DecoderError("Error when decoding NaN", i)
def decode_numeric(self, i):
start = i
@@ -154,7 +210,7 @@
ch = self.ll_chars[i]
if ch == '.':
if not self.ll_chars[i+1].isdigit():
- raise DecoderError("Expected digit at", i+1)
+ raise DecoderError("Expected digit", i+1)
return self.decode_float(start)
elif ch == 'e' or ch == 'E':
return self.decode_float(start)
@@ -162,7 +218,7 @@
return self.decode_int_slow(start)
self.pos = i
- return self.space.newint(intval)
+ return self.intcache.newint(intval)
def decode_float(self, i):
from rpython.rlib import rdtoa
@@ -208,13 +264,27 @@
break
count = i - start
if count == 0:
- raise DecoderError("Expected digit at", i)
+ raise DecoderError("Expected digit", i)
# if the number has more digits than OVF_DIGITS, it might have
# overflowed
ovf_maybe = (count >= OVF_DIGITS)
return i, ovf_maybe, sign * intval
+ def _raise_control_char_in_string(self, ch, startindex, currindex):
+ if ch == '\0':
+ self._raise("Unterminated string starting at",
+ startindex - 1)
+ else:
+ self._raise("Invalid control character at", currindex-1)
+
+ def _raise_object_error(self, ch, start, i):
+ if ch == '\0':
+ self._raise("Unterminated object starting at", start)
+ else:
+            self._raise("Unexpected '%s' when decoding object (char %d)" % (ch, ord(ch)), i)
+
def decode_array(self, i):
+ """ Decode a list. i must be after the opening '[' """
w_list = self.space.newlist([])
start = i
i = self.skip_whitespace(start)
@@ -248,63 +318,120 @@
self.pos = i+1
return self.space.newdict()
- d = self._create_empty_dict()
+ if self.scratch:
+ values_w = self.scratch.pop()
+ else:
+ values_w = [None] * self.DEFAULT_SIZE_SCRATCH
+ nextindex = 0
+ currmap = self.startmap
while True:
# parse a key: value
- w_name = self.decode_key(i)
+ currmap = self.decode_key_map(i, currmap)
i = self.skip_whitespace(self.pos)
ch = self.ll_chars[i]
if ch != ':':
raise DecoderError("No ':' found at", i)
i += 1
- i = self.skip_whitespace(i)
- #
- w_value = self.decode_any(i)
- d[w_name] = w_value
+
+ w_value = self.decode_any(i, currmap)
+
+ if nextindex == len(values_w): # full
+ values_w = values_w + [None] * len(values_w) # double
+ values_w[nextindex] = w_value
+ nextindex += 1
i = self.skip_whitespace(self.pos)
ch = self.ll_chars[i]
i += 1
if ch == '}':
self.pos = i
- return self._create_dict(d)
+ self.scratch.append(values_w) # can reuse next time
+ if currmap.is_state_blocked():
+ dict_w = self._switch_to_dict(currmap, values_w, nextindex)
+ return self._create_dict(dict_w)
+ values_w = values_w[:nextindex]
+ w_res = self._create_dict_map(values_w, currmap)
+ if not currmap.is_state_useful():
+ self.unclear_objects.append(w_res)
+ return w_res
elif ch == ',':
- pass
- elif ch == '\0':
- raise DecoderError("Unterminated object starting at", start)
+ i = self.skip_whitespace(i)
+ if currmap.is_state_blocked():
+ self.scratch.append(values_w) # can reuse next time
+ dict_w = self._switch_to_dict(currmap, values_w, nextindex)
+ return self.decode_object_dict(i, start, dict_w)
else:
- raise DecoderError("Unexpected '%s' when decoding object" % ch,
- i-1)
+ self._raise_object_error(ch, start, i - 1)
- def decode_string(self, i):
- start = i
- bits = 0
+ def _create_dict_map(self, values_w, jsonmap):
+ from pypy.objspace.std.jsondict import from_values_and_jsonmap
+ return from_values_and_jsonmap(self.space, values_w, jsonmap)
+
+ def _devolve_jsonmap_dict(self, w_dict):
+ from pypy.objspace.std.jsondict import devolve_jsonmap_dict
+ devolve_jsonmap_dict(w_dict)
+
+ def _get_jsonmap_from_dict(self, w_dict):
+ from pypy.objspace.std.jsondict import get_jsonmap_from_dict
+ return get_jsonmap_from_dict(w_dict)
+
+ def _switch_to_dict(self, currmap, values_w, nextindex):
+ dict_w = self._create_empty_dict()
+ index = nextindex - 1
+ while isinstance(currmap, JSONMap):
+ dict_w[currmap.w_key] = values_w[index]
+ index -= 1
+ currmap = currmap.prev
+ assert len(dict_w) == nextindex
+ return dict_w
+
+ def decode_object_dict(self, i, start, dict_w):
while True:
- # this loop is a fast path for strings which do not contain escape
- # characters
+ # parse a key: value
+ w_key = self.decode_key_string(i)
+ i = self.skip_whitespace(self.pos)
+ ch = self.ll_chars[i]
+ if ch != ':':
+ self._raise("No ':' found at", i)
+ i += 1
+
+ w_value = self.decode_any(i)
+ dict_w[w_key] = w_value
+ i = self.skip_whitespace(self.pos)
ch = self.ll_chars[i]
i += 1
- bits |= ord(ch)
- if ch == '"':
+ if ch == '}':
self.pos = i
- return self._create_string(start, i - 1, bits)
- elif ch == '\\' or ch < '\x20':
- self.pos = i-1
- return self.decode_string_escaped(start)
+ return self._create_dict(dict_w)
+ elif ch == ',':
+ i = self.skip_whitespace(i)
+ else:
+ self._raise_object_error(ch, start, i - 1)
- def _create_string(self, start, end, bits):
- if bits & 0x80:
- # the 8th bit is set, it's an utf8 string
- content_utf8 = self.getslice(start, end)
+ def decode_string_uncached(self, i):
+ start = i
+ ll_chars = self.ll_chars
+ nonascii, i = simd.find_end_of_string_no_hash(ll_chars, i, len(self.s))
+ ch = ll_chars[i]
+ if ch == '\\':
+ self.pos = i
+ return self.decode_string_escaped(start, nonascii)
+ if ch < '\x20':
+ self._raise_control_char_in_string(ch, start, i)
+ else:
+ assert ch == '"'
+
+ self.pos = i + 1
+ return self._create_string_wrapped(start, i, nonascii)
+
+ def _create_string_wrapped(self, start, end, nonascii):
+ content = self.getslice(start, end)
+ if nonascii:
+ # contains non-ascii chars, we need to check that it's valid utf-8
lgt = unicodehelper.check_utf8_or_raise(self.space,
- content_utf8)
- return self.space.newutf8(content_utf8, lgt)
+ content)
else:
- # ascii only, fast path (ascii is a strict subset of
- # latin1, and we already checked that all the chars are <
- # 128)
lgt = end - start
- assert lgt >= 0
- return self.space.newutf8(self.getslice(start, end), lgt)
+ return self.space.newutf8(content, lgt)
def _create_dict(self, d):
from pypy.objspace.std.dictmultiobject import from_unicode_key_dict
@@ -314,8 +441,7 @@
from pypy.objspace.std.dictmultiobject import create_empty_unicode_key_dict
return create_empty_unicode_key_dict(self.space)
-
- def decode_string_escaped(self, start):
+ def decode_string_escaped(self, start, nonascii):
i = self.pos
builder = StringBuilder((i - start) * 2) # just an estimate
assert start >= 0
@@ -326,25 +452,21 @@
i += 1
if ch == '"':
content_utf8 = builder.build()
- lgt = unicodehelper.check_utf8_or_raise(self.space,
+ length = unicodehelper.check_utf8_or_raise(self.space,
content_utf8)
self.pos = i
- return self.space.newutf8(content_utf8, lgt)
+ return self.space.newutf8(content_utf8, length)
elif ch == '\\':
- i = self.decode_escape_sequence(i, builder)
+ i = self.decode_escape_sequence_to_utf8(i, builder)
elif ch < '\x20':
- if ch == '\0':
- raise DecoderError("Unterminated string starting at",
- start - 1)
- else:
- raise DecoderError("Invalid control character at", i-1)
+ self._raise_control_char_in_string(ch, start, i)
else:
builder.append(ch)
- def decode_escape_sequence(self, i, builder):
+ def decode_escape_sequence_to_utf8(self, i, stringbuilder):
ch = self.ll_chars[i]
i += 1
- put = builder.append
+ put = stringbuilder.append
if ch == '\\': put('\\')
elif ch == '"': put('"' )
elif ch == '/': put('/' )
@@ -354,22 +476,37 @@
elif ch == 'r': put('\r')
elif ch == 't': put('\t')
elif ch == 'u':
- return self.decode_escape_sequence_unicode(i, builder)
+ # may be a surrogate pair
+ return self.decode_escape_sequence_unicode(i, stringbuilder)
else:
raise DecoderError("Invalid \\escape: %s" % ch, i-1)
return i
+ def _get_int_val_from_hex4(self, i):
+ ll_chars = self.ll_chars
+ res = 0
+ for i in range(i, i + 4):
+ ch = ord(ll_chars[i])
+ if ord('a') <= ch <= ord('f'):
+ digit = ch - ord('a') + 10
+ elif ord('A') <= ch <= ord('F'):
+ digit = ch - ord('A') + 10
+ elif ord('0') <= ch <= ord('9'):
+ digit = ch - ord('0')
+ else:
+ raise ValueError
+ res = (res << 4) + digit
+ return res
+
def decode_escape_sequence_unicode(self, i, builder):
# at this point we are just after the 'u' of the \u1234 sequence.
start = i
i += 4
- hexdigits = self.getslice(start, i)
try:
- val = int(hexdigits, 16)
+ val = self._get_int_val_from_hex4(start)
if (0xd800 <= val <= 0xdbff and
self.ll_chars[i] == '\\' and self.ll_chars[i+1] == 'u'):
- hexdigits = self.getslice(i+2, i+6)
- lowsurr = int(hexdigits, 16)
+ lowsurr = self._get_int_val_from_hex4(i + 2)
if 0xdc00 <= lowsurr <= 0xdfff:
# decode surrogate pair
val = 0x10000 + (((val - 0xd800) << 10) |
@@ -384,45 +521,585 @@
builder.append(utf8_ch)
return i
- def decode_key(self, i):
- """ returns a wrapped unicode """
- from rpython.rlib.rarithmetic import intmask
- i = self.skip_whitespace(i)
+ def decode_string(self, i, contextmap=None):
+ """ Decode a string at position i (which is right after the opening ").
+ Optionally pass a contextmap, if the value is decoded as the value of a
+ dict. """
ll_chars = self.ll_chars
+ start = i
+ ch = ll_chars[i]
+ if ch == '"':
+ self.pos = i + 1
+ return self.w_empty_string # surprisingly common
+
+ cache = True
+ if contextmap is not None:
+ contextmap.decoded_strings += 1
+ if not contextmap.should_cache_strings():
+ cache = False
+ if len(self.s) < self.MIN_SIZE_FOR_STRING_CACHE:
+ cache = False
+
+ if not cache:
+ return self.decode_string_uncached(i)
+
+ strhash, nonascii, i = simd.find_end_of_string(ll_chars, i, len(self.s))
+ ch = ll_chars[i]
+ if ch == '\\':
+ self.pos = i
+ return self.decode_string_escaped(start, nonascii)
+ if ch < '\x20':
+ self._raise_control_char_in_string(ch, start, i)
+ else:
+ assert ch == '"'
+
+ self.pos = i + 1
+
+ length = i - start
+ strhash ^= length
+
+ # check cache first:
+ try:
+ entry = self.cache_wrapped[strhash]
+ except KeyError:
+ w_res = self._create_string_wrapped(start, i, nonascii)
+ # only add *some* strings to the cache, because keeping them all is
+ # way too expensive
+ if ((contextmap is not None and
+ contextmap.decoded_strings < self.STRING_CACHE_EVALUATION_SIZE) or
+ strhash in self.lru_cache):
+ entry = CacheEntry(
+ self.getslice(start, start + length), w_res)
+ self.cache_wrapped[strhash] = entry
+ else:
+ self.lru_cache[self.lru_index] = strhash
+ self.lru_index = (self.lru_index + 1) & self.LRU_MASK
+ return w_res
+ if not entry.compare(ll_chars, start, length):
+ # collision! hopefully rare
+ return self._create_string_wrapped(start, i, nonascii)
+ if contextmap is not None:
+ contextmap.cache_hits += 1
+ return entry.w_uni
+
+ def decode_key_map(self, i, currmap):
+ """ Given the current map currmap of an object, decode the next key at
+ position i. This returns the new map of the object. """
+ newmap = self._decode_key_map(i, currmap)
+ currmap.observe_transition(newmap, self.startmap)
+ return newmap
+
+ def _decode_key_map(self, i, currmap):
+ ll_chars = self.ll_chars
+ # first try to see whether we happen to find currmap.single_nextmap
+ nextmap = currmap.fast_path_key_parse(self, i)
+ if nextmap is not None:
+ return nextmap
+
+ start = i
ch = ll_chars[i]
if ch != '"':
raise DecoderError("Key name must be string at char", i)
i += 1
+ w_key = self._decode_key_string(i)
+ return currmap.get_next(w_key, self.s, start, self.pos, self.startmap)
+ def _decode_key_string(self, i):
+ """ decode key at position i as a string. Key strings are always
+ cached, since they repeat a lot. """
+ ll_chars = self.ll_chars
start = i
- bits = 0
- strhash = ord(ll_chars[i]) << 7
- while True:
- ch = ll_chars[i]
+
+ strhash, nonascii, i = simd.find_end_of_string(ll_chars, i, len(self.s))
+
+ ch = ll_chars[i]
+ if ch == '\\':
+ self.pos = i
+ w_key = self.decode_string_escaped(start, nonascii)
+ return w_key
+ if ch < '\x20':
+ self._raise_control_char_in_string(ch, start, i)
+ length = i - start
+ strhash ^= length
+ self.pos = i + 1
+ # check cache first:
+ try:
+ entry = self.cache[strhash]
+ except KeyError:
+ w_res = self._create_string_wrapped(start, i, nonascii)
+ entry = CacheEntry(
+ self.getslice(start, start + length), w_res)
+ self.cache[strhash] = entry
+ return w_res
+ if not entry.compare(ll_chars, start, length):
+ # collision! hopefully rare
+ w_res = self._create_string_wrapped(start, i, nonascii)
+ else:
+ w_res = entry.w_uni
+ return w_res
+
+ def decode_key_string(self, i):
+ ll_chars = self.ll_chars
+ ch = ll_chars[i]
+ if ch != '"':
+            self._raise("Key name must be string at char", i)
+ i += 1
+ return self._decode_key_string(i)
+
+class CacheEntry(object):
+ """ A cache entry, bundling the encoded version of a string, and its wrapped
+ decoded variant. """
+ def __init__(self, repr, w_uni):
+ # repr is the escaped string
+ self.repr = repr
+ # uni is the wrapped decoded string
+ self.w_uni = w_uni
+
+ def compare(self, ll_chars, start, length):
+ """ Check whether self.repr occurs at ll_chars[start:start+length] """
+ if length != len(self.repr):
+ return False
+ index = start
+ for c in self.repr:
+ if not ll_chars[index] == c:
+ return False
+ index += 1
+ return True
+
+
+class MapBase(object):
+ """ A map implementation to speed up parsing of json dicts, and to
+ represent the resulting dicts more compactly and make access faster. """
+
+ # the basic problem we are trying to solve is the following: dicts in
+ # json can either be used as objects, or as dictionaries with arbitrary
+ # string keys. We want to use maps for the former, but not for the
+ # latter. But we don't know in advance which kind of dict is which.
+
+ # Therefore we create "preliminary" maps where we aren't quite sure yet
+ # whether they are really useful maps or not. If we see them used often
+ # enough, we promote them to "useful" maps, which we will actually
+ # instantiate objects with.
+
+ # If we determine that a map is not used often enough, we can turn it
+ # into a "blocked" map, which is a point in the map tree where we will
+ # switch to regular dicts, when we reach that part of the tree.
+
+ # One added complication: We want to keep the number of preliminary maps
+ # bounded to prevent generating tons of useless maps. but also not too
+ # small, to support having a json file that contains many uniform objects
+ # with tons of keys. That's where the idea of "fringe" maps comes into
+ # play. They are maps that sit between known useful nodes and preliminary
+ # nodes in the map transition tree. We bound only the number of fringe
+ # nodes we are considering (to MAX_FRINGE), but not the number of
+ # preliminary maps. When we have too many fringe maps, we remove the least
+ # commonly instantiated fringe map and mark it as blocked.
+
+ # allowed graph edges or nodes in all_next:
+ # USEFUL -------
+ # / \ \
+ # v v v
+ # FRINGE USEFUL BLOCKED
+ # |
+ # v
+ # PRELIMINARY
+ # |
+ # v
+ # PRELIMINARY
+
+ # state transitions:
+ # PRELIMINARY
+ # / | \
+ # | v v
+ # | FRINGE -> USEFUL
+ # | |
+ # \ |
+ # v v
+ # BLOCKED
+
+ # the single_nextmap edge can only be these graph edges:
+ # USEFUL
+ # |
+ # v
+ # USEFUL
+ #
+ # FRINGE
+ # |
+ # v
+ # PRELIMINARY
+ # |
+ # v
+ # PRELIMINARY
+
+ USEFUL = 'u'
+ PRELIMINARY = 'p'
+ FRINGE = 'f' # buffer between PRELIMINARY and USEFUL
+ BLOCKED = 'b'
+
+ # tunable parameters
+ MAX_FRINGE = 40
+ USEFUL_THRESHOLD = 5
+
+ def __init__(self, space):
+ self.space = space
+
+ # a single transition is stored in .single_nextmap
+ self.single_nextmap = None
+
+ # all_next is only initialized after seeing the *second* transition
+ # but then it also contains .single_nextmap
+ self.all_next = None # later dict {key: nextmap}
+
+ # keep some statistics about every map: how often it was instantiated
+ # and how many non-blocked leaves the map transition tree has, starting
+ # from self
+ self.instantiation_count = 0
+ self.number_of_leaves = 1
+
+ def _check_invariants(self):
+ if self.all_next:
+ for next in self.all_next.itervalues():
+ next._check_invariants()
+ elif self.single_nextmap:
+ self.single_nextmap._check_invariants()
+
+ def get_next(self, w_key, string, start, stop, terminator):
+ from pypy.objspace.std.dictmultiobject import unicode_hash, unicode_eq
+ if isinstance(self, JSONMap):
+ assert not self.state == MapBase.BLOCKED
+ single_nextmap = self.single_nextmap
+ if (single_nextmap is not None and
+ single_nextmap.w_key.eq_w(w_key)):
+ return single_nextmap
+
+ assert stop >= 0
+ assert start >= 0
+
+ if single_nextmap is None:
+ # first transition ever seen, don't initialize all_next
+ next = self._make_next_map(w_key, string[start:stop])
+ self.single_nextmap = next
+ else:
+ if self.all_next is None:
+ self.all_next = objectmodel.r_dict(unicode_eq, unicode_hash,
+ force_non_null=True, simple_hash_eq=True)
+ self.all_next[single_nextmap.w_key] = single_nextmap
+ else:
+ next = self.all_next.get(w_key, None)
+ if next is not None:
+ return next
+ next = self._make_next_map(w_key, string[start:stop])
+ self.all_next[w_key] = next
+
+ # one new leaf has been created
+ self.change_number_of_leaves(1)
+
+ terminator.register_potential_fringe(next)
+ return next
+
+ def change_number_of_leaves(self, difference):
+ """ add difference to .number_of_leaves of self and its parents """
+ if not difference:
+ return
+ parent = self
+ while isinstance(parent, JSONMap):
+ parent.number_of_leaves += difference
+ parent = parent.prev
+ parent.number_of_leaves += difference # terminator
+
+ def fast_path_key_parse(self, decoder, position):
+ """ Fast path when parsing the next key: We speculate that we will
+ always see a commonly seen next key, and use strcmp (implemented in
+ key_repr_cmp) to check whether that is the case. """
+ single_nextmap = self.single_nextmap
+ if single_nextmap:
+ ll_chars = decoder.ll_chars
+ assert isinstance(single_nextmap, JSONMap)
+ if single_nextmap.key_repr_cmp(ll_chars, position):
+ decoder.pos = position + len(single_nextmap.key_repr)
+ return single_nextmap
+
+ def observe_transition(self, newmap, terminator):
+ """ observe a transition from self to newmap.
+ This does a few things, including updating the self size estimate with
+ the knowledge that one object transitioned from self to newmap.
+ also it potentially decides that self should move to state USEFUL."""
+ newmap.instantiation_count += 1
+ if isinstance(self, JSONMap) and self.state == MapBase.FRINGE:
+ if self.is_useful():
+ self.mark_useful(terminator)
+
+ def _make_next_map(self, w_key, key_repr):
+ return JSONMap(self.space, self, w_key, key_repr)
+
+ def _all_dot(self, output):
+ identity = objectmodel.compute_unique_id(self)
+ output.append('%s [shape=box%s];' % (identity, self._get_dot_text()))
+ if self.all_next:
+ for w_key, value in self.all_next.items():
+ assert isinstance(value, JSONMap)
+ if value is self.single_nextmap:
+ color = ", color=blue"
+ else:
+ color = ""
+ output.append('%s -> %s [label="%s"%s];' % (
+ identity, objectmodel.compute_unique_id(value), value.w_key._utf8, color))
+ value._all_dot(output)
+ elif self.single_nextmap is not None:
+ value = self.single_nextmap
+ output.append('%s -> %s [label="%s", color=blue];' % (
+ identity, objectmodel.compute_unique_id(value), value.w_key._utf8))
+ value._all_dot(output)
+
+
+ def _get_dot_text(self):
+ return ", label=base"
+
+ def view(self):
+ from dotviewer import graphclient
+ import pytest
+ r = ["digraph G {"]
+ self._all_dot(r)
+ r.append("}")
+ p = pytest.ensuretemp("jsonmap").join("temp.dot")
+ p.write("\n".join(r))
+ graphclient.display_dot_file(str(p))
+
+
+class Terminator(MapBase):
+ """ The root node of the map transition tree. """
+ def __init__(self, space):
+ MapBase.__init__(self, space)
+ # a set of all map nodes that are currently in the FRINGE state
+ self.current_fringe = {}
+
+ def register_potential_fringe(self, prelim):
+ """ add prelim to the fringe, if its prev is either a Terminator or
+ useful. """
+ prev = prelim.prev
+ if (isinstance(prev, Terminator) or
+ isinstance(prev, JSONMap) and prev.state == MapBase.USEFUL):
+ assert prelim.state == MapBase.PRELIMINARY
+ prelim.state = MapBase.FRINGE
+
+ if len(self.current_fringe) > MapBase.MAX_FRINGE:
+ self.cleanup_fringe()
+ self.current_fringe[prelim] = None
+
+ def remove_from_fringe(self, former_fringe):
+ """ Remove former_fringe from self.current_fringe. """
+ assert former_fringe.state in (MapBase.USEFUL, MapBase.BLOCKED)
+ del self.current_fringe[former_fringe]
+
+ def cleanup_fringe(self):
+ """ remove the least-instantiated fringe map and block it."""
+ min_fringe = None
+ min_avg = 1e200
+ for f in self.current_fringe:
+ assert f.state == MapBase.FRINGE
+ avg = f.average_instantiation()
+ if avg < min_avg:
+ min_avg = avg
+ min_fringe = f
+ assert min_fringe
+ min_fringe.mark_blocked(self)
+
+ def _check_invariants(self):
+ for fringe in self.current_fringe:
+ assert fringe.state == MapBase.FRINGE
+
+class JSONMap(MapBase):
+ """ A map implementation to speed up parsing """
+
+ def __init__(self, space, prev, w_key, key_repr):
+ MapBase.__init__(self, space)
+
+ self.prev = prev
+ self.w_key = w_key
+ self.key_repr = key_repr
+
+ self.state = MapBase.PRELIMINARY
+
+ # key decoding stats
+ self.decoded_strings = 0
+ self.cache_hits = 0
+
+ # for jsondict support
+ self.key_to_index = None
+ self.keys_in_order = None
+ self.strategy_instance = None
+
+ def __repr__(self):
+ return "<JSONMap key_repr=%s #instantiation=%s #leaves=%s prev=%r>" % (
+ self.key_repr, self.instantiation_count, self.number_of_leaves, self.prev)
+
+ def _get_terminator(self): # only for _check_invariants
+ while isinstance(self, JSONMap):
+ self = self.prev
+ assert isinstance(self, Terminator)
+ return self
+
+ def _check_invariants(self):
+ assert self.state in (
+ MapBase.USEFUL,
+ MapBase.PRELIMINARY,
+ MapBase.FRINGE,
+ MapBase.BLOCKED,
+ )
+
+ prev = self.prev
+ if isinstance(prev, JSONMap):
+ prevstate = prev.state
+ else:
+ prevstate = MapBase.USEFUL
+
+ if prevstate == MapBase.USEFUL:
+ assert self.state != MapBase.PRELIMINARY
+ elif prevstate == MapBase.PRELIMINARY:
+ assert self.state == MapBase.PRELIMINARY
+ elif prevstate == MapBase.FRINGE:
+ assert self.state == MapBase.PRELIMINARY
+ else:
+ # if prevstate is BLOCKED, we shouldn't have recursed here!
+ assert False, "should be unreachable"
+
+ if self.state == MapBase.BLOCKED:
+ assert self.single_nextmap is None
+ assert self.all_next is None
+ elif self.state == MapBase.FRINGE:
+ assert self in self._get_terminator().current_fringe
+
+ MapBase._check_invariants(self)
+
+ def mark_useful(self, terminator):
+ """ mark self as useful, and also the most commonly instantiated
+ children, recursively """
+ was_fringe = self.state == MapBase.FRINGE
+ assert self.state in (MapBase.FRINGE, MapBase.PRELIMINARY)
+ self.state = MapBase.USEFUL
+ if was_fringe:
+ terminator.remove_from_fringe(self)
+ # find the most commonly instantiated child, store it into
+ # single_nextmap and mark it useful, recursively
+ maxchild = self.single_nextmap
+ if self.all_next is not None:
+ for child in self.all_next.itervalues():
+ if child.instantiation_count > maxchild.instantiation_count:
+ maxchild = child
+ if maxchild is not None:
+ maxchild.mark_useful(terminator)
+ if self.all_next:
+ for child in self.all_next.itervalues():
+ if child is not maxchild:
+ terminator.register_potential_fringe(child)
+ self.single_nextmap = maxchild
+
+ def mark_blocked(self, terminator):
+ """ mark self and recursively all its children as blocked."""
+ was_fringe = self.state == MapBase.FRINGE
+ self.state = MapBase.BLOCKED
+ if was_fringe:
+ terminator.remove_from_fringe(self)
+ if self.all_next:
+ for next in self.all_next.itervalues():
+ next.mark_blocked(terminator)
+ elif self.single_nextmap:
+ self.single_nextmap.mark_blocked(terminator)
+ self.single_nextmap = None
+ self.all_next = None
+ self.change_number_of_leaves(-self.number_of_leaves + 1)
+
+ def is_state_blocked(self):
+ return self.state == MapBase.BLOCKED
+
+ def is_state_useful(self):
+ return self.state == MapBase.USEFUL
+
+ def average_instantiation(self):
+ """ the number of instantiations, divided by the number of leaves. We
+ want to favor nodes that have either a high instantiation count, or few
+ leaves below it. """
+ return self.instantiation_count / float(self.number_of_leaves)
+
+ def is_useful(self):
+ return self.average_instantiation() > self.USEFUL_THRESHOLD
+
+ def should_cache_strings(self):
+ """ return whether strings parsed in the context of this map should be
+ cached. """
+ # we should cache if either we've seen few strings so far (less than
+ # STRING_CACHE_EVALUATION_SIZE), or if we've seen many, and the cache
+ # hit rate has been high enough
+ return not (self.decoded_strings > JSONDecoder.STRING_CACHE_EVALUATION_SIZE and
+ self.cache_hits * JSONDecoder.STRING_CACHE_USEFULNESS_FACTOR < self.decoded_strings)
+
+ def key_repr_cmp(self, ll_chars, i):
+ for j, c in enumerate(self.key_repr):
+ if ll_chars[i] != c:
+ return False
i += 1
- if ch == '"':
- break
- elif ch == '\\' or ch < '\x20':
- self.pos = i-1
- return self.decode_string_escaped(start)
- strhash = intmask((1000003 * strhash) ^ ord(ll_chars[i]))
- bits |= ord(ch)
- length = i - start - 1
- if length == 0:
- strhash = -1
+ return True
+
+ # _____________________________________________________
+ # methods for JsonDictStrategy
+
+ @jit.elidable
+ def get_index(self, w_key):
+ from pypy.objspace.std.unicodeobject import W_UnicodeObject
+ assert isinstance(w_key, W_UnicodeObject)
+ return self.get_key_to_index().get(w_key, -1)
+
+ def get_key_to_index(self):
+ from pypy.objspace.std.dictmultiobject import unicode_hash, unicode_eq
+ key_to_index = self.key_to_index
+ if key_to_index is None:
+ key_to_index = self.key_to_index = objectmodel.r_dict(unicode_eq, unicode_hash,
+ force_non_null=True, simple_hash_eq=True)
+ # compute depth
+ curr = self
+ depth = 0
+ while True:
+ depth += 1
+ curr = curr.prev
+ if not isinstance(curr, JSONMap):
+ break
+
+ curr = self
+ while depth:
+ depth -= 1
+ key_to_index[curr.w_key] = depth
+ curr = curr.prev
+ if not isinstance(curr, JSONMap):
+ break
+ return key_to_index
+
+ def get_keys_in_order(self):
+ keys_in_order = self.keys_in_order
+ if keys_in_order is None:
+ key_to_index = self.get_key_to_index()
+ keys_in_order = self.keys_in_order = [None] * len(key_to_index)
+ for w_key, index in key_to_index.iteritems():
+ keys_in_order[index] = w_key
+ return keys_in_order
+
+ # _____________________________________________________
+
+ def _get_dot_text(self):
+ if self.all_next is None:
+ l = int(self.single_nextmap is not None)
else:
- strhash ^= length
- strhash = intmask(strhash)
- self.pos = i
- # check cache first:
- key = (ll_chars, start, length, strhash)
- try:
- return self.cache[key]
- except KeyError:
- pass
- res = self._create_string(start, i - 1, bits)
- self.cache[key] = res
+ l = len(self.all_next)
+ extra = ""
+ if self.decoded_strings:
+            extra = "\\n%s/%s (%s%%)" % (self.cache_hits, self.decoded_strings, self.cache_hits * 100.0 / self.decoded_strings)
+ res = ', label="#%s\\nchildren: %s%s"' % (self.instantiation_count, l, extra)
+ if self.state == MapBase.BLOCKED:
+ res += ", fillcolor=lightsalmon"
+ if self.state == MapBase.FRINGE:
+ res += ", fillcolor=lightgray"
+ if self.state == MapBase.PRELIMINARY:
+ res += ", fillcolor=lightslategray"
return res
def loads(space, w_s, w_errorcls=None):
@@ -443,3 +1120,4 @@
raise OperationError(w_errorcls, w_e)
finally:
decoder.close()
+
diff --git a/pypy/module/_pypyjson/simd.py b/pypy/module/_pypyjson/simd.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_pypyjson/simd.py
@@ -0,0 +1,218 @@
+from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.rlib import objectmodel, unroll
+from rpython.rlib.rarithmetic import r_uint, intmask, LONG_BIT
+from rpython.jit.backend.detect_cpu import autodetect
+
+# accelerators for string operations using simd on regular word sizes (*not*
+# SSE instructions). this style is sometimes called SWAR (SIMD Within A
+# Register) or "broadword techniques"
+
+# XXX remove wordsize and endianness restrictions properly, so far only x86-64
+# is tested
+
# SWAR configuration: per-word constants depending on the native word size.
USE_SIMD = False
if LONG_BIT == 64:
    WORD_SIZE = 8                                 # bytes per machine word
    EVERY_BYTE_ONE = 0x0101010101010101           # 0x01 replicated into every byte
    EVERY_BYTE_HIGHEST_BIT = 0x8080808080808080   # 0x80 replicated into every byte
    if autodetect() == "x86-64":
        # only enabled on x86-64 so far (see XXX above about wordsize and
        # endianness restrictions)
        USE_SIMD = True
else:
    WORD_SIZE = 4
    EVERY_BYTE_ONE = 0x01010101
    EVERY_BYTE_HIGHEST_BIT = 0x80808080


# helpers

# loops over this are unrolled at translation time by RPython
unrolling_wordsize = unroll.unrolling_iterable(range(WORD_SIZE))
+
def char_repeated_word_width(ch):
    """Return a word with the byte value of *ch* replicated into every
    byte position."""
    return ord(ch) * r_uint(EVERY_BYTE_ONE)
+
def any_char_zero(word):
    """Nonzero iff at least one byte of *word* is 0 (the classic SWAR
    zero-byte test)."""
    borrows = word - r_uint(EVERY_BYTE_ONE)
    return borrows & ~word & r_uint(EVERY_BYTE_HIGHEST_BIT)
+
def any_char_in_words_zero(*words):
    """Nonzero iff any byte of any of *words* is 0."""
    return _any_char_in_any_word_zero_accum(0, *words)
+
def _any_char_in_any_word_zero_accum(accum, word, *words):
    # written recursively so RPython specializes one step per word; the
    # zero-byte borrow pattern of each word is or-ed into accum and the
    # high-bit mask is applied once at the end
    accum |= (word - r_uint(EVERY_BYTE_ONE)) & ~word
    if not words:
        return accum & r_uint(EVERY_BYTE_HIGHEST_BIT)
    return _any_char_in_any_word_zero_accum(accum, *words)
+
def print_chars(word):
    """Decode *word* into its WORD_SIZE characters, lowest byte first.
    For debugging only."""
    chars = []
    for _ in range(WORD_SIZE):
        chars.append(chr(word & 0xff))
        word >>= 8
    return ''.join(chars)
+
def index_nonzero(word):
    """Return the index (0 = lowest byte) of the first nonzero byte of
    *word*.  *word* must not be 0."""
    # XXX can be done very cheap in theory
    assert word
    for i in unrolling_wordsize:
        if word & 0xff:
            return i
        word >>= 8
    assert 0  # unreachable: word was asserted nonzero
+
def index_zero(word):
    """Return the index (0 = lowest byte) of the first zero byte of
    *word*.  *word* must contain at least one zero byte."""
    # XXX can be done very cheap in theory
    assert any_char_zero(word)
    for i in unrolling_wordsize:
        if not word & 0xff:
            return i
        word >>= 8
    assert 0  # XXX ??? unreachable given the precondition
+
def splice_words(word, offset, other):
    """Combine two words: the low *offset* bytes come from *other*, the
    remaining high bytes from *word*."""
    low_mask = ~((~r_uint(0)) << (8 * offset))
    return (other & low_mask) | (word & ~low_mask)
+
+
+
@objectmodel.always_inline
def position_string_ender(word):
    """Return a nonzero mask iff some byte of *word* ends a JSON string
    body: a '"', a '\\', or a control character < 0x20."""
    maskquote = char_repeated_word_width('"')
    maskbackslash = char_repeated_word_width('\\')
    # 0xe0: the three top bits; a byte b has b < 0x20 iff b & 0xe0 == 0
    maskx20 = char_repeated_word_width(chr(0xff - 0x1f))
    # x1 and x2 check for equality, if a byte is 0 the corresponding
    # char is equal to " or \
    x1 = maskquote ^ word
    x2 = maskbackslash ^ word
    # x3 checks for char < 0x20, the byte is 0 in that case
    x3 = maskx20 & word
    return any_char_in_words_zero(x1, x2, x3)
+
@objectmodel.always_inline
def find_end_of_string_simd_unaligned(ll_chars, startpos, length):
    """Scan ll_chars a machine word at a time for the first character that
    ends a JSON string body: '"', '\\', or a control character < 0x20.

    Returns (strhash, nonascii, endindex): a hash of the characters before
    the terminator, whether any of them has the high bit set, and the
    terminator's index.
    """
    ch = ll_chars[startpos]
    strhash = (ord(ch) << 7) ^ 0x345678    # hash seed from the first char

    # reinterpret the chars from startpos on as an array of machine words
    wordarray = rffi.cast(rffi.ULONGP, rffi.ptradd(ll_chars, startpos))
    num_safe_reads = (length - startpos) // WORD_SIZE  # full words in bounds

    bits = 0   # or of all scanned chars, used for the nonascii check
    for i in range(num_safe_reads):
        word = wordarray[i]
        cond = position_string_ender(word)
        if cond:
            break
        bits |= word
        strhash = intmask((1000003 * strhash) ^ intmask(word))
    else:
        # didn't find end of string yet, look at remaining chars
        # (ll_chars[length] is the final null, so the terminator test is
        # guaranteed to trigger there at the latest)
        word = 0
        shift = 0
        i = 0
        for i in range(num_safe_reads * WORD_SIZE + startpos, length + 1):
            ch = ll_chars[i]
            if ch == '"' or ch == '\\' or ch < '\x20':
                break
            bits |= ord(ch)
            word |= ord(ch) << shift   # repack the tail chars into a word
            shift += 8
        if shift:
            strhash = intmask((1000003 * strhash) ^ intmask(word))

        nonascii = bool(bits & char_repeated_word_width(chr(0x80)))
        return strhash, nonascii, i

    # the terminator is somewhere inside `word`: compute its exact position
    nonzero = index_nonzero(cond)
    endposition = startpos + i * WORD_SIZE + nonzero
    if nonzero:
        # keep only the bytes before the terminator, then fold them in
        word = splice_words(r_uint(0), nonzero, word)
        bits |= word
        strhash = intmask((1000003 * strhash) ^ intmask(word))

    nonascii = bool(bits & char_repeated_word_width(chr(0x80)))

    return strhash, nonascii, endposition
+
@objectmodel.always_inline
def find_end_of_string_simd_unaligned_no_hash(ll_chars, startpos, length):
    """Like find_end_of_string_simd_unaligned, but without computing a hash.

    Returns (nonascii, endindex): whether any scanned character has the
    high bit set, and the index of the first '"', '\\' or control
    character (< 0x20) at or after startpos.

    Fix vs. the hashing variant's copy-paste: the tail loop here used to
    accumulate `word`/`shift` (with a wrong `shift += WORD_SIZE` on top),
    but those values were never read in the no-hash variant; the dead
    accumulation is removed.
    """
    # reinterpret the chars from startpos on as an array of machine words
    wordarray = rffi.cast(rffi.ULONGP, rffi.ptradd(ll_chars, startpos))
    num_safe_reads = (length - startpos) // WORD_SIZE  # full words in bounds

    bits = 0   # or of all scanned chars, used for the nonascii check
    for i in range(num_safe_reads):
        word = wordarray[i]
        cond = position_string_ender(word)
        if cond:
            break
        bits |= word
    else:
        # didn't find end of string yet, look at remaining chars
        # (ll_chars[length] is the final null, so the terminator test is
        # guaranteed to trigger there at the latest)
        i = 0
        for i in range(num_safe_reads * WORD_SIZE + startpos, length + 1):
            ch = ll_chars[i]
            if ch == '"' or ch == '\\' or ch < '\x20':
                break
            bits |= ord(ch)

        nonascii = bool(bits & char_repeated_word_width(chr(0x80)))
        return nonascii, i

    # the terminator is somewhere inside `word`: compute its exact position
    nonzero = index_nonzero(cond)
    endposition = startpos + i * WORD_SIZE + nonzero
    if nonzero:
        # keep only the bytes before the terminator for the nonascii check
        word = splice_words(r_uint(0), nonzero, word)
        bits |= word

    nonascii = bool(bits & char_repeated_word_width(chr(0x80)))

    return nonascii, endposition
+
+
@objectmodel.always_inline
def find_end_of_string_slow(ll_chars, i, length):
    """Byte-at-a-time scan for the first character that ends a JSON string
    body: '"', '\\', or a control character < 0x20.

    Returns (strhash, nonascii, endindex), matching the result of
    find_end_of_string_simd_unaligned (the hypothesis tests compare the
    two on little-endian x86-64).

    Fix: the char-packing used `shift += WORD_SIZE` with a wrap at
    WORD_SIZE * WORD_SIZE, confusing bits with bytes.  A char occupies 8
    bits, so the shift must advance by 8 and wrap at WORD_SIZE * 8 when a
    full word's worth of chars has been accumulated.  On 64-bit
    (WORD_SIZE == 8) the two happen to coincide, so behavior there is
    unchanged; on 32-bit the old code packed chars at 4-bit offsets.
    """
    ch = ll_chars[i]
    strhash = (ord(ch) << 7) ^ 0x345678    # hash seed from the first char
    word = 0
    shift = 0

    bits = 0   # or of all scanned chars, used for the nonascii check

    while True:
        # this loop is a fast path for strings which do not contain escape
        # characters
        ch = ll_chars[i]
        if ch == '"' or ch == '\\' or ch < '\x20':
            break
        i += 1
        bits |= ord(ch)

        word |= ord(ch) << shift
        shift += 8
        if shift == WORD_SIZE * 8:
            # a full word of chars accumulated: fold it into the hash
            strhash = intmask((1000003 * strhash) ^ word)
            shift = 0
            word = 0

    if shift:
        # fold in the leftover partial word
        strhash = intmask((1000003 * strhash) ^ word)
    return strhash, bool(bits & 0x80), i
+
# select the implementations exported to interp_decoder: the word-at-a-time
# scanners when SWAR is enabled, otherwise the byte-at-a-time fallback
if USE_SIMD:
    find_end_of_string = find_end_of_string_simd_unaligned
    find_end_of_string_no_hash = find_end_of_string_simd_unaligned_no_hash
else:
    find_end_of_string = find_end_of_string_slow

    @objectmodel.always_inline
    def find_end_of_string_no_hash(ll_chars, i, length):
        # derive the no-hash variant by dropping the hash from the result
        _, nonascii, i = find_end_of_string_slow(ll_chars, i, length)
        return (nonascii, i)
+
+
diff --git a/pypy/module/_pypyjson/test/test__pypyjson.py b/pypy/module/_pypyjson/test/test__pypyjson.py
--- a/pypy/module/_pypyjson/test/test__pypyjson.py
+++ b/pypy/module/_pypyjson/test/test__pypyjson.py
@@ -1,31 +1,251 @@
# -*- encoding: utf-8 -*-
-from pypy.module._pypyjson.interp_decoder import JSONDecoder
+import pytest
+from pypy.module._pypyjson.interp_decoder import JSONDecoder, Terminator, MapBase
+from rpython.rtyper.lltypesystem import lltype, rffi
-def test_skip_whitespace():
- s = ' hello '
- dec = JSONDecoder('fake space', s)
- assert dec.pos == 0
- assert dec.skip_whitespace(0) == 3
- assert dec.skip_whitespace(3) == 3
- assert dec.skip_whitespace(8) == len(s)
- dec.close()
-class FakeSpace(object):
- def newutf8(self, s, l):
- return s
+class TestJson(object):
+ def test_skip_whitespace(self):
+ s = ' hello '
+ dec = JSONDecoder(self.space, s)
+ assert dec.pos == 0
+ assert dec.skip_whitespace(0) == 3
+ assert dec.skip_whitespace(3) == 3
+ assert dec.skip_whitespace(8) == len(s)
+ dec.close()
-def test_decode_key():
- s1 = "123" * 100
- s = ' "%s" "%s" ' % (s1, s1)
- dec = JSONDecoder(FakeSpace(), s)
- assert dec.pos == 0
- x = dec.decode_key(0)
- assert x == s1
- # check caching
- y = dec.decode_key(dec.pos)
- assert y == s1
- assert y is x
- dec.close()
+ def test_json_map(self):
+ m = Terminator(self.space)
+ w_a = self.space.newutf8("a", 1)
+ w_b = self.space.newutf8("b", 1)
+ w_c = self.space.newutf8("c", 1)
+ m1 = m.get_next(w_a, '"a"', 0, 3, m)
+ assert m1.w_key == w_a
+ assert m1.single_nextmap is None
+ assert m1.key_repr == '"a"'
+ assert m1.key_repr_cmp('"a": 123', 0)
+ assert not m1.key_repr_cmp('b": 123', 0)
+ assert m.single_nextmap.w_key == w_a
+
+ m2 = m.get_next(w_a, '"a"', 0, 3, m)
+ assert m2 is m1
+
+ m3 = m.get_next(w_b, '"b"', 0, 3, m)
+ assert m3.w_key == w_b
+ assert m3.single_nextmap is None
+ assert m3.key_repr == '"b"'
+ assert m.single_nextmap is m1
+
+ m4 = m3.get_next(w_c, '"c"', 0, 3, m)
+ assert m4.w_key == w_c
+ assert m4.single_nextmap is None
+ assert m4.key_repr == '"c"'
+ assert m3.single_nextmap is m4
+
+ def test_json_map_get_index(self):
+ m = Terminator(self.space)
+ w_a = self.space.newutf8("a", 1)
+ w_b = self.space.newutf8("b", 1)
+ w_c = self.space.newutf8("c", 1)
+ m1 = m.get_next(w_a, 'a"', 0, 2, m)
+ assert m1.get_index(w_a) == 0
+ assert m1.get_index(w_b) == -1
+
+ m2 = m.get_next(w_b, 'b"', 0, 2, m)
+ assert m2.get_index(w_b) == 0
+ assert m2.get_index(w_a) == -1
+
+ m3 = m2.get_next(w_c, 'c"', 0, 2, m)
+ assert m3.get_index(w_b) == 0
+ assert m3.get_index(w_c) == 1
+ assert m3.get_index(w_a) == -1
+
+ def test_decode_key_map(self):
+ m = Terminator(self.space)
+ m_diff = Terminator(self.space)
+ for s1 in ["abc", "1001" * 10, u"ä".encode("utf-8")]:
+ s = ' "%s" "%s" "%s"' % (s1, s1, s1)
+ dec = JSONDecoder(self.space, s)
+ assert dec.pos == 0
+ m1 = dec.decode_key_map(dec.skip_whitespace(0), m)
+ assert m1.w_key._utf8 == s1
+ assert m1.key_repr == '"%s"' % s1
+
+ # check caching on w_key level
+ m2 = dec.decode_key_map(dec.skip_whitespace(dec.pos), m_diff)
+ assert m1.w_key is m2.w_key
+
+ # check caching on map level
+ m3 = dec.decode_key_map(dec.skip_whitespace(dec.pos), m_diff)
+ assert m3 is m2
+ dec.close()
+
+ def test_decode_string_caching(self):
+ for s1 in ["abc", u"ä".encode("utf-8")]:
+ s = '"%s" "%s" "%s"' % (s1, s1, s1)
+ dec = JSONDecoder(self.space, s)
+ dec.MIN_SIZE_FOR_STRING_CACHE = 0
+ assert dec.pos == 0
+ w_x = dec.decode_string(1)
+ w_y = dec.decode_string(dec.skip_whitespace(dec.pos) + 1)
+ assert w_x is not w_y
+ # check caching
+ w_z = dec.decode_string(dec.skip_whitespace(dec.pos) + 1)
+ assert w_z is w_y
+ dec.close()
+
+ def _make_some_maps(self):
+ # base -> m1 -> m2 -> m3
+ # \-> m4
+ w_a = self.space.newutf8("a", 1)
+ w_b = self.space.newutf8("b", 1)
+ w_c = self.space.newutf8("c", 1)
+ w_d = self.space.newutf8("d", 1)
+ base = Terminator(self.space)
+ base.instantiation_count = 6
+ m1 = base.get_next(w_a, 'a"', 0, 2, base)
+ m2 = m1.get_next(w_b, 'b"', 0, 2, base)
+ m3 = m2.get_next(w_c, 'c"', 0, 2, base)
+ m4 = m2.get_next(w_d, 'd"', 0, 2, base)
+ return base, m1, m2, m3, m4
+
+ # unit tests for map state transistions
+ def test_fringe_to_useful(self):
+ base, m1, m2, m3, m4 = self._make_some_maps()
+ base.instantiation_count = 6
+ assert m1.state == MapBase.FRINGE
+ m1.instantiation_count = 6
+
+ assert m2.state == MapBase.PRELIMINARY
+ m2.instantiation_count = 6
+
+ assert m3.state == MapBase.PRELIMINARY
+ m3.instantiation_count = 2
+ assert m2.single_nextmap is m3
+
+ assert m4.state == MapBase.PRELIMINARY
+ m4.instantiation_count = 4
+
+ m1.mark_useful(base)
+ assert m1.state == MapBase.USEFUL
+ assert m2.state == MapBase.USEFUL
+ assert m3.state == MapBase.FRINGE
+ assert m4.state == MapBase.USEFUL
+ assert m2.single_nextmap is m4
+
+ assert m1.number_of_leaves == 2
+ base._check_invariants()
+
+ def test_number_of_leaves(self):
+ w_x = self.space.newutf8("x", 1)
+ base, m1, m2, m3, m4 = self._make_some_maps()
+ assert base.number_of_leaves == 2
+ assert m1.number_of_leaves == 2
+ assert m2.number_of_leaves == 2
+ assert m3.number_of_leaves == 1
+ assert m4.number_of_leaves == 1
+ m5 = m2.get_next(w_x, 'x"', 0, 2, base)
+ assert base.number_of_leaves == 3
+ assert m1.number_of_leaves == 3
+ assert m2.number_of_leaves == 3
+ assert m5.number_of_leaves == 1
+
+ def test_number_of_leaves_after_mark_blocked(self):
+ w_x = self.space.newutf8("x", 1)
+ base, m1, m2, m3, m4 = self._make_some_maps()
+ m5 = m2.get_next(w_x, 'x"', 0, 2, base)
+ assert base.number_of_leaves == 3
+ m2.mark_blocked(base)
+ assert base.number_of_leaves == 1
+
+ def test_mark_useful_cleans_fringe(self):
+ base, m1, m2, m3, m4 = self._make_some_maps()
+ base.instantiation_count = 6
+ assert m1.state == MapBase.FRINGE
+ m1.instantiation_count = 6
+ m2.instantiation_count = 6
+ m3.instantiation_count = 2
+ m4.instantiation_count = 4
+ assert base.current_fringe == {m1: None}
+
+ m1.mark_useful(base)
+ assert base.current_fringe == {m3: None}
+
+ def test_cleanup_fringe(self):
+ w_a = self.space.newutf8("a", 1)
+ w_b = self.space.newutf8("b", 1)
+ w_c = self.space.newutf8("c", 1)
+ w_d = self.space.newutf8("d", 1)
+ base = Terminator(self.space)
+ base.instantiation_count = 6
+ m1 = base.get_next(w_a, 'a"', 0, 2, base)
+ m2 = base.get_next(w_b, 'b"', 0, 2, base)
+ m3 = base.get_next(w_c, 'c"', 0, 2, base)
+ m4 = base.get_next(w_d, 'd"', 0, 2, base)
+ m5 = m4.get_next(w_a, 'a"', 0, 2, base)
+ base.instantiation_count = 7
+ m1.instantiation_count = 2
+ m2.instantiation_count = 2
+ m3.instantiation_count = 2
+ m4.instantiation_count = 1
+ m5.instantiation_count = 1
+ assert base.current_fringe == dict.fromkeys([m1, m2, m3, m4])
+
+ base.cleanup_fringe()
+ assert base.current_fringe == dict.fromkeys([m1, m2, m3])
+ assert m4.state == MapBase.BLOCKED
+ assert m4.single_nextmap is None
+ assert m4.all_next is None
+ assert m5.state == MapBase.BLOCKED
+ assert m5.single_nextmap is None
+ assert m5.all_next is None
+
+ def test_deal_with_blocked(self):
+ w_a = self.space.newutf8("a", 1)
+ w_b = self.space.newutf8("b", 1)
+ w_c = self.space.newutf8("c", 1)
+ space = self.space
+ s = '{"a": 1, "b": 2, "c": 3}'
+ dec = JSONDecoder(space, s)
+ dec.startmap = base = Terminator(space)
+ m1 = base.get_next(w_a, 'a"', 0, 2, base)
+ m2 = m1.get_next(w_b, 'b"', 0, 2, base)
+ m2.mark_blocked(base)
+ w_res = dec.decode_object(1)
+ assert space.int_w(space.len(w_res)) == 3
+ assert space.int_w(space.getitem(w_res, w_a)) == 1
+ assert space.int_w(space.getitem(w_res, w_b)) == 2
+ assert space.int_w(space.getitem(w_res, w_c)) == 3
+ dec.close()
+
+ def test_deal_with_blocked_number_of_leaves(self):
+ w_a = self.space.newutf8("a", 1)
+ w_b = self.space.newutf8("b", 1)
+ w_x = self.space.newutf8("x", 1)
+ w_u = self.space.newutf8("u", 1)
+ space = self.space
+ base = Terminator(space)
+ m1 = base.get_next(w_a, 'a"', 0, 2, base)
+ m2 = m1.get_next(w_b, 'b"', 0, 2, base)
+ m2.get_next(w_x, 'x"', 0, 2, base)
+ m2.get_next(w_u, 'u"', 0, 2, base)
+ assert base.number_of_leaves == 2
+ m2.mark_blocked(base)
+ assert base.number_of_leaves == 1
+
+ def test_instatiation_count(self):
+ m = Terminator(self.space)
+ dec = JSONDecoder(self.space, '"abc" "def"')
+ m1 = dec.decode_key_map(dec.skip_whitespace(0), m)
+ m2 = dec.decode_key_map(dec.skip_whitespace(6), m1)
+ m1 = dec.decode_key_map(dec.skip_whitespace(0), m)
+ m2 = dec.decode_key_map(dec.skip_whitespace(6), m1)
+ m1 = dec.decode_key_map(dec.skip_whitespace(0), m)
+
+ assert m1.instantiation_count == 3
+ assert m2.instantiation_count == 2
+ dec.close()
+
class AppTest(object):
spaceconfig = {"usemodules": ['_pypyjson']}
@@ -106,6 +326,12 @@
s = r'"\u1234"'
assert _pypyjson.loads(s) == '\u1234'
+ def test_escape_sequence_mixed_with_unicode(self):
+ import _pypyjson
+ assert _pypyjson.loads(r'"abc\\' + u'ä"') == u'abc\\ä'
+ assert _pypyjson.loads(r'"abc\"' + u'ä"') == u'abc"ä'
+ assert _pypyjson.loads(r'"def\u1234' + u'ä"') == u'def\u1234ä'
+
def test_invalid_utf_8(self):
import _pypyjson
s = '"\xe0"' # this is an invalid UTF8 sequence inside a string
@@ -175,9 +401,14 @@
s = '{"hello": "world", "aaa": "bbb"}'
assert _pypyjson.loads(s) == {'hello': 'world',
'aaa': 'bbb'}
+ assert _pypyjson.loads(s) == {'hello': 'world',
+ 'aaa': 'bbb'}
raises(ValueError, _pypyjson.loads, '{"key"')
raises(ValueError, _pypyjson.loads, '{"key": 42')
+ assert _pypyjson.loads('{"neighborhood": ""}') == {
+ "neighborhood": ""}
+
def test_decode_object_nonstring_key(self):
import _pypyjson
raises(ValueError, "_pypyjson.loads('{42: 43}')")
@@ -248,11 +479,11 @@
def test_error_position(self):
import _pypyjson
test_cases = [
- ('[,', "Unexpected ',' at", 1),
- ('{"spam":[}', "Unexpected '}' at", 9),
+ ('[,', "Unexpected ','", 1),
+ ('{"spam":[}', "Unexpected '}'", 9),
('[42:', "Unexpected ':' when decoding array", 3),
('[42 "spam"', "Unexpected '\"' when decoding array", 4),
- ('[42,]', "Unexpected ']' at", 4),
+ ('[42,]', "Unexpected ']'", 4),
('{"spam":[42}', "Unexpected '}' when decoding array", 11),
('["]', 'Unterminated string starting at', 1),
('["spam":', "Unexpected ':' when decoding array", 7),
@@ -275,4 +506,4 @@
class MyError(Exception):
pass
exc = raises(MyError, _pypyjson.loads, 'nul', MyError)
- assert exc.value.args == ('Error when decoding null at', 'nul', 1)
+ assert exc.value.args == ('Error when decoding null', 'nul', 1)
diff --git a/pypy/module/_pypyjson/test/test_simd.py b/pypy/module/_pypyjson/test/test_simd.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_pypyjson/test/test_simd.py
@@ -0,0 +1,107 @@
+import sys
+import pytest
+from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.rlib.rarithmetic import r_uint, intmask
+
+from pypy.module._pypyjson.simd import USE_SIMD
+from pypy.module._pypyjson.simd import find_end_of_string_slow
+from pypy.module._pypyjson.simd import print_chars
+from pypy.module._pypyjson.simd import find_end_of_string_simd_unaligned, WORD_SIZE
+from pypy.module._pypyjson.simd import find_end_of_string_simd_unaligned_no_hash
+
+try:
+ from hypothesis import example, given, strategies
+except ImportError:
+ pytest.skip("missing hypothesis!")
+
+if not USE_SIMD:
+ pytest.skip("only implemented for 64 bit for now")
+
def fill_to_word_size(res, ch=" "):
    """Pad *res* with *ch* up to the next multiple of WORD_SIZE."""
    padding = -len(res) % WORD_SIZE
    return res + ch * padding
+
def string_to_word(s):
    """Pack the WORD_SIZE bytes of *s* into one machine word (native byte
    order), going through a temporary raw nonmoving buffer."""
    assert len(s) == WORD_SIZE
    ll_chars, flag = rffi.get_nonmovingbuffer_final_null(s)
    try:
        wordarray = rffi.cast(rffi.ULONGP, ll_chars)
        return wordarray[0]
    finally:
        rffi.free_nonmovingbuffer(s, ll_chars, flag)
+
def ll(callable, string, *args):
    """Call *callable* with a temporary raw, null-terminated buffer for
    *string* (plus *args*), freeing the buffer afterwards."""
    ll_chars, flag = rffi.get_nonmovingbuffer_final_null(string)
    try:
        return callable(ll_chars, *args)
    finally:
        rffi.free_nonmovingbuffer(string, ll_chars, flag)
+
+word = strategies.builds(
+ r_uint, strategies.integers(min_value=-sys.maxint-1, max_value=sys.maxint))
+
def build_string(prefix, content, end, suffix):
    """Assemble a test string prefix + '"' + content + end + suffix, padded
    to a word multiple.  Returns (string, startindex) with startindex just
    past the opening quote."""
    body = "".join([chr(code) for code in content])
    assembled = prefix + '"' + body + end + suffix
    return fill_to_word_size(assembled), len(prefix) + 1
+
+string_in_context_strategy = strategies.builds(
+ build_string, prefix=strategies.binary(),
+ content=strategies.lists(strategies.integers(1, 255), min_size=1),
+ end=strategies.sampled_from('"\\\x00\x01'),
+ suffix=strategies.binary())
+
def compare(string, res1, res2):
    """Assert that two find_end_of_string results agree: same end index,
    same nonascii flag, and (only when the char before the end index is a
    closing quote) the same hash."""
    h_a, nonascii_a, end_a = res1
    h_b, nonascii_b, end_b = res2
    assert end_a == end_b
    ended_on_quote = string[end_a - 1] == '"'
    if ended_on_quote:
        assert h_a == h_b
    assert nonascii_a == nonascii_b
+
+
+ at example(('" \x80" ', 1))
+ at example(('"\x01" ', 1))
+ at example(('"aaaaaaaa"\x00\x00\x00\x00\x00\x00\x00 ', 1))
+ at example(('"aaaaaaaa" ', 1))
+ at example(('"12"', 1))
+ at example(('"1234567abcdefghAB"', 1))
+ at example(('"1234567abcdefgh"', 1))
+ at example((' "123456ABCDEF" \x00', 2))
+ at example((' "123456aaaaaaaaABCDEF"\x00', 2))
+ at given(string_in_context_strategy)
+def test_find_end_of_string(a):
+ (string, startindex) = a
+ res = ll(find_end_of_string_slow, string, startindex, len(string))
+ hash, nonascii1, endposition1 = res
+ ch = string[endposition1]
+ assert ch == '"' or ch == '\\' or ch < '\x20'
+ for ch in string[startindex:endposition1]:
+ assert not (ch == '"' or ch == '\\' or ch < '\x20')
+ compare(string, res, ll(find_end_of_string_simd_unaligned, string, startindex, len(string)))
+
+ nonascii2, endposition2 = ll(find_end_of_string_simd_unaligned_no_hash, string, startindex, len(string))
+ assert nonascii1 == nonascii2
+ assert endposition1 == endposition2
+
+ at given(string_in_context_strategy, strategies.binary(min_size=1))
+def test_find_end_of_string_position_invariance(a, prefix):
+ fn = find_end_of_string_simd_unaligned
+ (string, startindex) = a
+ h1, nonascii1, i1 = ll(fn, string, startindex, len(string))
+ string2 = prefix + string
+ h2, nonascii2, i2 = ll(fn, string2, startindex + len(prefix), len(string) + len(prefix))
+ assert h1 == h2
+ assert nonascii1 == nonascii2
+ assert i1 + len(prefix) == i2
+
+ at given(string_in_context_strategy, strategies.binary(min_size=1))
+def test_find_end_of_string_position_invariance_no_hash(a, prefix):
+ fn = find_end_of_string_simd_unaligned_no_hash
+ (string, startindex) = a
+ nonascii1, i1 = ll(fn, string, startindex, len(string))
+ string2 = prefix + string
+ nonascii2, i2 = ll(fn, string2, startindex + len(prefix), len(string) + len(prefix))
+ assert nonascii1 == nonascii2
+ assert i1 + len(prefix) == i2
+
diff --git a/pypy/objspace/std/jsondict.py b/pypy/objspace/std/jsondict.py
new file mode 100644
--- /dev/null
+++ b/pypy/objspace/std/jsondict.py
@@ -0,0 +1,167 @@
+"""dict implementation specialized for object loaded by the _pypyjson module.
+
+Somewhat similar to MapDictStrategy, also uses a map.
+"""
+
+from rpython.rlib import jit, rerased, objectmodel, debug
+
+from pypy.objspace.std.dictmultiobject import (
+ UnicodeDictStrategy, DictStrategy,
+ create_iterator_classes, W_DictObject)
+
+
def from_values_and_jsonmap(space, values_w, jsonmap):
    """Build a W_DictObject over the flat *values_w* list, keyed by the
    shared *jsonmap* (the strategy instance is cached on the map)."""
    if not objectmodel.we_are_translated():
        assert len(values_w) == len(jsonmap.get_keys_in_order())
        assert len(values_w) != 0
    debug.make_sure_not_resized(values_w)
    strategy = jsonmap.strategy_instance
    if strategy is None:
        strategy = JsonDictStrategy(space, jsonmap)
        jsonmap.strategy_instance = strategy
    return W_DictObject(space, strategy, strategy.erase(values_w))
+
def devolve_jsonmap_dict(w_dict):
    """Force *w_dict*, which must currently use JsonDictStrategy, over to
    the general unicode dict strategy."""
    assert isinstance(w_dict, W_DictObject)
    current = w_dict.get_strategy()
    assert isinstance(current, JsonDictStrategy)
    current.switch_to_unicode_strategy(w_dict)
+
def get_jsonmap_from_dict(w_dict):
    """Return the JSON map backing *w_dict*, which must currently use
    JsonDictStrategy."""
    assert isinstance(w_dict, W_DictObject)
    current = w_dict.get_strategy()
    assert isinstance(current, JsonDictStrategy)
    return current.jsonmap
+
class JsonDictStrategy(DictStrategy):
    """Dict strategy for dicts created by _pypyjson: values live in a flat
    list (the erased storage), keys live in the shared JSON map, which
    knows each key's index into that list.  Any operation the strategy
    cannot handle devolves the dict to the general UnicodeDictStrategy.
    """
    erase, unerase = rerased.new_erasing_pair("jsondict")
    erase = staticmethod(erase)
    unerase = staticmethod(unerase)

    # one strategy instance per jsonmap; the map never changes afterwards
    _immutable_fields_ = ['jsonmap']

    def __init__(self, space, jsonmap):
        DictStrategy.__init__(self, space)
        self.jsonmap = jsonmap

    def wrap(self, w_key):
        # keys are stored already wrapped
        return w_key

    def wrapkey(space, key):
        # NOTE(review): deliberately no `self` -- presumably matches the
        # signature expected by create_iterator_classes (see bottom of
        # file); confirm against dictmultiobject
        return key

    def get_empty_storage(self):
        # empty dicts never get this strategy
        raise NotImplementedError("should not be reachable")

    def is_correct_type(self, w_obj):
        # only exact unicode keys can be looked up in the jsonmap
        space = self.space
        return space.is_w(space.type(w_obj), space.w_unicode)

    def _never_equal_to(self, w_lookup_type):
        return False

    def length(self, w_dict):
        return len(self.unerase(w_dict.dstorage))

    def getitem(self, w_dict, w_key):
        if self.is_correct_type(w_key):
            return self.getitem_unicode(w_dict, w_key)
        else:
            # non-unicode key: devolve and retry on the new strategy
            self.switch_to_unicode_strategy(w_dict)
            return w_dict.getitem(w_key)

    def getitem_unicode(self, w_dict, w_key):
        """Lookup with a key already known to be unicode; returns None if
        the key is not in the map."""
        storage_w = self.unerase(w_dict.dstorage)
        if jit.isconstant(w_key):
            # with a constant key, promoting self lets the elidable
            # get_index call constant-fold
            jit.promote(self)
        index = self.jsonmap.get_index(w_key)
        if index == -1:
            return None
        return storage_w[index]

    def setitem(self, w_dict, w_key, w_value):
        if self.is_correct_type(w_key):
            storage_w = self.unerase(w_dict.dstorage)
            index = self.jsonmap.get_index(w_key)
            if index != -1:
                # overwriting an existing key keeps the strategy
                storage_w[index] = w_value
                return
        # new key or non-unicode key: devolve and retry
        self.switch_to_unicode_strategy(w_dict)
        w_dict.setitem(w_key, w_value)

    def setdefault(self, w_dict, w_key, w_default):
        if self.is_correct_type(w_key):
            w_result = self.getitem_unicode(w_dict, w_key)
            if w_result is not None:
                return w_result
        self.switch_to_unicode_strategy(w_dict)
        return w_dict.setdefault(w_key, w_default)

    def delitem(self, w_dict, w_key):
        # deletion always devolves: the flat storage cannot shrink
        self.switch_to_unicode_strategy(w_dict)
        return w_dict.delitem(w_key)

    def popitem(self, w_dict):
        self.switch_to_unicode_strategy(w_dict)
        return w_dict.popitem()

    def switch_to_unicode_strategy(self, w_dict):
        """Copy all keys/values into a regular unicode-keyed dict and make
        w_dict use UnicodeDictStrategy from now on."""
        strategy = self.space.fromcache(UnicodeDictStrategy)
        values_w = self.unerase(w_dict.dstorage)
        storage = strategy.get_empty_storage()
        d_new = strategy.unerase(storage)
        keys_in_order = self.jsonmap.get_keys_in_order()
        assert len(keys_in_order) == len(values_w)
        for index, w_key in enumerate(keys_in_order):
            assert w_key is not None
            assert type(w_key) is self.space.UnicodeObjectCls
            d_new[w_key] = values_w[index]
        w_dict.set_strategy(strategy)
        w_dict.dstorage = storage

    def w_keys(self, w_dict):
        return self.space.newlist(self.jsonmap.get_keys_in_order())

    def values(self, w_dict):
        return self.unerase(w_dict.dstorage)[:]  # to make resizable

    def items(self, w_dict):
        space = self.space
        storage_w = self.unerase(w_dict.dstorage)
        res = [None] * len(storage_w)
        for index, w_key in enumerate(self.jsonmap.get_keys_in_order()):
            res[index] = space.newtuple([w_key, storage_w[index]])
        return res

    def getiterkeys(self, w_dict):
        return iter(self.jsonmap.get_keys_in_order())

    def getitervalues(self, w_dict):
        storage_w = self.unerase(w_dict.dstorage)
        return iter(storage_w)

    def getiteritems_with_hash(self, w_dict):
        storage_w = self.unerase(w_dict.dstorage)
        return ZipItemsWithHash(self.jsonmap.get_keys_in_order(), storage_w)
+
+
+class ZipItemsWithHash(object):
+ def __init__(self, list1, list2):
+ assert len(list1) == len(list2)
+ self.list1 = list1
+ self.list2 = list2
+ self.i = 0
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ i = self.i
+ if i >= len(self.list1):
+ raise StopIteration
+ self.i = i + 1
+ w_key = self.list1[i]
+ return (w_key, self.list2[i], w_key.hash_w())
+
+
+create_iterator_classes(JsonDictStrategy)
diff --git a/pypy/objspace/std/test/test_jsondict.py b/pypy/objspace/std/test/test_jsondict.py
new file mode 100644
--- /dev/null
+++ b/pypy/objspace/std/test/test_jsondict.py
@@ -0,0 +1,89 @@
+
+class AppTest(object):
+ spaceconfig = {"objspace.usemodules._pypyjson": True}
+
+ def test_check_strategy(self):
+ import __pypy__
+ import _pypyjson
+
+ d = _pypyjson.loads('{"a": 1}')
+ assert __pypy__.strategy(d) == "JsonDictStrategy"
+ d = _pypyjson.loads('{}')
+ assert __pypy__.strategy(d) == "EmptyDictStrategy"
+
+ def test_simple(self):
+ import __pypy__
+ import _pypyjson
+
+ d = _pypyjson.loads('{"a": 1, "b": "x"}')
+ assert len(d) == 2
More information about the pypy-commit
mailing list