[pypy-commit] pypy json-decoder-maps: cleanup instantiation_count and number_of_leaves, some comments, some renaming
cfbolz
pypy.commits at gmail.com
Wed Jun 5 16:00:01 EDT 2019
Author: Carl Friedrich Bolz-Tereick <cfbolz at gmx.de>
Branch: json-decoder-maps
Changeset: r96760:ec6df04c89ba
Date: 2019-06-05 18:49 +0200
http://bitbucket.org/pypy/pypy/changeset/ec6df04c89ba/
Log: cleanup instantiation_count and number_of_leaves, some comments,
some renaming
diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -24,17 +24,6 @@
return 0.0
return x * NEG_POW_10[exp]
-def _compare_cache_entry(space, res, ll_chars, start, length):
- if length != len(res):
- return False
- index = start
- for c in res:
- x = ord(c)
- if not ll_chars[index] == chr(x):
- return False
- index += 1
- return True
-
class IntCache(object):
START = -10
@@ -84,6 +73,9 @@
self.pos = 0
self.intcache = space.fromcache(IntCache)
+ # two caches, one for keys, one for general strings. they both have the
+ # form {hash-as-int: CacheEntry} and they don't deal with
+ # collisions at all. For every hash there is simply one string stored.
self.cache = {}
self.cache_wrapped = {}
@@ -339,7 +331,7 @@
currmap = self.startmap
while True:
# parse a key: value
- currmap = self.decode_key(i, currmap)
+ currmap = self.decode_key_map(i, currmap)
i = self.skip_whitespace(self.pos)
ch = self.ll_chars[i]
if ch != ':':
@@ -359,11 +351,9 @@
self.pos = i
self.scratch.append(values_w) # can reuse next time
if currmap.is_state_blocked():
- currmap.instantiation_count += 1
dict_w = self._switch_to_dict(currmap, values_w, nextindex)
return self._create_dict(dict_w)
values_w = values_w[:nextindex]
- currmap.instantiation_count += 1
w_res = self._create_dict_map(values_w, currmap)
if not currmap.is_state_useful():
self.unclear_objects.append(w_res)
@@ -371,7 +361,6 @@
elif ch == ',':
i = self.skip_whitespace(i)
if currmap.is_state_blocked():
- currmap.instantiation_count += 1
self.scratch.append(values_w) # can reuse next time
dict_w = self._switch_to_dict(currmap, values_w, nextindex)
return self.decode_object_dict(i, start, dict_w)
@@ -582,27 +571,30 @@
if ((context is not None and
context.decoded_strings < self.STRING_CACHE_EVALUATION_SIZE) or
strhash in self.lru_cache):
- entry = WrappedCacheEntry(
+ entry = CacheEntry(
self.getslice(start, start + length), w_res)
self.cache_wrapped[strhash] = entry
else:
self.lru_cache[self.lru_index] = strhash
self.lru_index = (self.lru_index + 1) & self.LRU_MASK
return w_res
- if not _compare_cache_entry(self.space, entry.repr, ll_chars, start, length):
- # hopefully rare
+ if not entry.compare(ll_chars, start, length):
+ # collision! hopefully rare
return self._create_string_wrapped(start, i, nonascii)
if context is not None:
context.cache_hits += 1
return entry.w_uni
- def decode_key(self, i, currmap):
- newmap = self._decode_key(i, currmap)
+ def decode_key_map(self, i, currmap):
+ """ Given the current map currmap of an object, decode the next key at
+ position i. This returns the new map of the object. """
+ newmap = self._decode_key_map(i, currmap)
currmap.observe_transition(newmap, self.startmap)
return newmap
- def _decode_key(self, i, currmap):
+ def _decode_key_map(self, i, currmap):
ll_chars = self.ll_chars
+ # first try to see whether we happen to find currmap.single_nextmap
nextmap = currmap.fast_path_key_parse(self, i)
if nextmap is not None:
return nextmap
@@ -616,6 +608,8 @@
return currmap.get_next(w_key, self.s, start, self.pos, self.startmap)
def _decode_key_string(self, i):
+ """ decode key at position i as a string. Key strings are always
+ cached, since they repeat a lot. """
ll_chars = self.ll_chars
start = i
@@ -636,14 +630,13 @@
entry = self.cache[strhash]
except KeyError:
w_res = self._create_string_wrapped(start, i, nonascii)
- entry = WrappedCacheEntry(
+ entry = CacheEntry(
self.getslice(start, start + length), w_res)
self.cache[strhash] = entry
return w_res
- if not _compare_cache_entry(self.space, entry.repr, ll_chars, start, length):
- # hopefully rare
+ if not entry.compare(ll_chars, start, length):
+ # collision! hopefully rare
w_res = self._create_string_wrapped(start, i, nonascii)
- print w_res
else:
w_res = entry.w_uni
return w_res
@@ -656,11 +649,26 @@
i += 1
return self._decode_key_string(i)
-class WrappedCacheEntry(object):
+class CacheEntry(object):
+    """ A cache entry, bundling the encoded version of a string, and its wrapped
+    decoded variant. """
def __init__(self, repr, w_uni):
+ # repr is the escaped string
self.repr = repr
+ # uni is the wrapped decoded string
self.w_uni = w_uni
+ def compare(self, ll_chars, start, length):
+ """ Check whether self.repr occurs at ll_chars[start:start+length] """
+ if length != len(self.repr):
+ return False
+ index = start
+ for c in self.repr:
+ if not ll_chars[index] == c:
+ return False
+ index += 1
+ return True
+
class MapBase(object):
""" A map implementation to speed up parsing of json dicts, and to
@@ -745,6 +753,9 @@
# but then it also contains .single_nextmap
self.all_next = None # later dict {key: nextmap}
+ # keep some statistics about every map: how often it was instantiated
+ # and how many non-blocked leaves the map transition tree has, starting
+ # from self
self.instantiation_count = 0
self.number_of_leaves = 1
@@ -783,13 +794,15 @@
next = self._make_next_map(w_key, string[start:stop])
self.all_next[w_key] = next
- # fix number_of_leaves
+ # one new leaf has been created
self.change_number_of_leaves(1)
terminator.register_potential_fringe(next)
return next
def change_number_of_leaves(self, difference):
+ if not difference:
+ return
parent = self
while isinstance(parent, JSONMap):
parent.number_of_leaves += difference
@@ -797,6 +810,9 @@
parent.number_of_leaves += difference # terminator
def fast_path_key_parse(self, decoder, position):
+ """ Fast path when parsing the next key: We speculate that we will
+ always see a commonly seen next key, and use strcmp (implemented in
+ key_repr_cmp) to check whether that is the case. """
single_nextmap = self.single_nextmap
if single_nextmap:
ll_chars = decoder.ll_chars
@@ -810,7 +826,7 @@
This does a few things, including updating the self size estimate with
the knowledge that one object transitioned from self to newmap.
also it potentially decides that self should move to state USEFUL."""
- self.instantiation_count += 1
+ newmap.instantiation_count += 1
if isinstance(self, JSONMap) and self.state == MapBase.FRINGE:
if self.is_useful():
self.mark_useful(terminator)
@@ -915,6 +931,10 @@
self.keys_in_order = None
self.strategy_instance = None
+ def __repr__(self):
+ return "<JSONMap key_repr=%s #instantiation=%s #leaves=%s prev=%r>" % (
+ self.key_repr, self.instantiation_count, self.number_of_leaves, self.prev)
+
def _get_terminator(self): # only for _check_invariants
while isinstance(self, JSONMap):
self = self.prev
@@ -961,6 +981,8 @@
self.state = MapBase.USEFUL
if was_fringe:
terminator.remove_from_fringe(self)
+ # find the most commonly instantiated child, store it into
+ # single_nextmap and mark it useful, recursively
maxchild = self.single_nextmap
if self.all_next is not None:
for child in self.all_next.itervalues():
@@ -975,6 +997,7 @@
self.single_nextmap = maxchild
def mark_blocked(self, terminator):
+ """ mark self and recursively all its children as blocked."""
was_fringe = self.state == MapBase.FRINGE
self.state = MapBase.BLOCKED
if was_fringe:
@@ -1059,7 +1082,7 @@
if self.all_next is None:
l = int(self.single_nextmap is not None)
else:
- l = len(self.all_next) + 1
+ l = len(self.all_next)
extra = ""
if self.decoded_strings:
extra = "\\n%s/%s (%s%%)" % (self.cache_hits, self.decoded_strings, self.cache_hits/float(self.decoded_strings))
diff --git a/pypy/module/_pypyjson/test/test__pypyjson.py b/pypy/module/_pypyjson/test/test__pypyjson.py
--- a/pypy/module/_pypyjson/test/test__pypyjson.py
+++ b/pypy/module/_pypyjson/test/test__pypyjson.py
@@ -60,23 +60,23 @@
assert m3.get_index(w_c) == 1
assert m3.get_index(w_a) == -1
- def test_decode_key(self):
+ def test_decode_key_map(self):
m = Terminator(self.space)
m_diff = Terminator(self.space)
for s1 in ["abc", "1001" * 10, u"ä".encode("utf-8")]:
s = ' "%s" "%s" "%s"' % (s1, s1, s1)
dec = JSONDecoder(self.space, s)
assert dec.pos == 0
- m1 = dec.decode_key(dec.skip_whitespace(0), m)
+ m1 = dec.decode_key_map(dec.skip_whitespace(0), m)
assert m1.w_key._utf8 == s1
assert m1.key_repr == '"%s"' % s1
# check caching on w_key level
- m2 = dec.decode_key(dec.skip_whitespace(dec.pos), m_diff)
+ m2 = dec.decode_key_map(dec.skip_whitespace(dec.pos), m_diff)
assert m1.w_key is m2.w_key
# check caching on map level
- m3 = dec.decode_key(dec.skip_whitespace(dec.pos), m_diff)
+ m3 = dec.decode_key_map(dec.skip_whitespace(dec.pos), m_diff)
assert m3 is m2
dec.close()
@@ -150,6 +150,14 @@
assert m2.number_of_leaves == 3
assert m5.number_of_leaves == 1
+ def test_number_of_leaves_after_mark_blocked(self):
+ w_x = self.space.newutf8("x", 1)
+ base, m1, m2, m3, m4 = self._make_some_maps()
+ m5 = m2.get_next(w_x, 'x"', 0, 2, base)
+ assert base.number_of_leaves == 3
+ m2.mark_blocked(base)
+ assert base.number_of_leaves == 1
+
def test_mark_useful_cleans_fringe(self):
base, m1, m2, m3, m4 = self._make_some_maps()
base.instantiation_count = 6
@@ -225,6 +233,19 @@
m2.mark_blocked(base)
assert base.number_of_leaves == 1
+ def test_instatiation_count(self):
+ m = Terminator(self.space)
+ dec = JSONDecoder(self.space, '"abc" "def"')
+ m1 = dec.decode_key_map(dec.skip_whitespace(0), m)
+ m2 = dec.decode_key_map(dec.skip_whitespace(6), m1)
+ m1 = dec.decode_key_map(dec.skip_whitespace(0), m)
+ m2 = dec.decode_key_map(dec.skip_whitespace(6), m1)
+ m1 = dec.decode_key_map(dec.skip_whitespace(0), m)
+
+ assert m1.instantiation_count == 3
+ assert m2.instantiation_count == 2
+ dec.close()
+
class AppTest(object):
spaceconfig = {"objspace.usemodules._pypyjson": True}
More information about the pypy-commit
mailing list