[pypy-commit] pypy json-decoder-maps: cleanup instantiation_count and number_of_leaves, some comments, some renaming

cfbolz pypy.commits at gmail.com
Wed Jun 5 16:00:01 EDT 2019


Author: Carl Friedrich Bolz-Tereick <cfbolz at gmx.de>
Branch: json-decoder-maps
Changeset: r96760:ec6df04c89ba
Date: 2019-06-05 18:49 +0200
http://bitbucket.org/pypy/pypy/changeset/ec6df04c89ba/

Log:	cleanup instantiation_count and number_of_leaves, some comments,
	some renaming

diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -24,17 +24,6 @@
         return 0.0
     return x * NEG_POW_10[exp]
 
-def _compare_cache_entry(space, res, ll_chars, start, length):
-    if length != len(res):
-        return False
-    index = start
-    for c in res:
-        x = ord(c)
-        if not ll_chars[index] == chr(x):
-            return False
-        index += 1
-    return True
-
 
 class IntCache(object):
     START = -10
@@ -84,6 +73,9 @@
         self.pos = 0
         self.intcache = space.fromcache(IntCache)
 
+        # two caches, one for keys, one for general strings. they both have the
+        # form {hash-as-int: CacheEntry} and they don't deal with
+        # collisions at all. For every hash there is simply one string stored.
         self.cache = {}
         self.cache_wrapped = {}
 
@@ -339,7 +331,7 @@
         currmap = self.startmap
         while True:
             # parse a key: value
-            currmap = self.decode_key(i, currmap)
+            currmap = self.decode_key_map(i, currmap)
             i = self.skip_whitespace(self.pos)
             ch = self.ll_chars[i]
             if ch != ':':
@@ -359,11 +351,9 @@
                 self.pos = i
                 self.scratch.append(values_w)  # can reuse next time
                 if currmap.is_state_blocked():
-                    currmap.instantiation_count += 1
                     dict_w = self._switch_to_dict(currmap, values_w, nextindex)
                     return self._create_dict(dict_w)
                 values_w = values_w[:nextindex]
-                currmap.instantiation_count += 1
                 w_res = self._create_dict_map(values_w, currmap)
                 if not currmap.is_state_useful():
                     self.unclear_objects.append(w_res)
@@ -371,7 +361,6 @@
             elif ch == ',':
                 i = self.skip_whitespace(i)
                 if currmap.is_state_blocked():
-                    currmap.instantiation_count += 1
                     self.scratch.append(values_w)  # can reuse next time
                     dict_w = self._switch_to_dict(currmap, values_w, nextindex)
                     return self.decode_object_dict(i, start, dict_w)
@@ -582,27 +571,30 @@
             if ((context is not None and
                         context.decoded_strings < self.STRING_CACHE_EVALUATION_SIZE) or
                     strhash in self.lru_cache):
-                entry = WrappedCacheEntry(
+                entry = CacheEntry(
                         self.getslice(start, start + length), w_res)
                 self.cache_wrapped[strhash] = entry
             else:
                 self.lru_cache[self.lru_index] = strhash
                 self.lru_index = (self.lru_index + 1) & self.LRU_MASK
             return w_res
-        if not _compare_cache_entry(self.space, entry.repr, ll_chars, start, length):
-            # hopefully rare
+        if not entry.compare(ll_chars, start, length):
+            # collision! hopefully rare
             return self._create_string_wrapped(start, i, nonascii)
         if context is not None:
             context.cache_hits += 1
         return entry.w_uni
 
-    def decode_key(self, i, currmap):
-        newmap = self._decode_key(i, currmap)
+    def decode_key_map(self, i, currmap):
+        """ Given the current map currmap of an object, decode the next key at
+        position i. This returns the new map of the object. """
+        newmap = self._decode_key_map(i, currmap)
         currmap.observe_transition(newmap, self.startmap)
         return newmap
 
-    def _decode_key(self, i, currmap):
+    def _decode_key_map(self, i, currmap):
         ll_chars = self.ll_chars
+        # first try to see whether we happen to find currmap.single_nextmap
         nextmap = currmap.fast_path_key_parse(self, i)
         if nextmap is not None:
             return nextmap
@@ -616,6 +608,8 @@
         return currmap.get_next(w_key, self.s, start, self.pos, self.startmap)
 
     def _decode_key_string(self, i):
+        """ decode key at position i as a string. Key strings are always
+        cached, since they repeat a lot. """
         ll_chars = self.ll_chars
         start = i
 
@@ -636,14 +630,13 @@
             entry = self.cache[strhash]
         except KeyError:
             w_res = self._create_string_wrapped(start, i, nonascii)
-            entry = WrappedCacheEntry(
+            entry = CacheEntry(
                     self.getslice(start, start + length), w_res)
             self.cache[strhash] = entry
             return w_res
-        if not _compare_cache_entry(self.space, entry.repr, ll_chars, start, length):
-            # hopefully rare
+        if not entry.compare(ll_chars, start, length):
+            # collision! hopefully rare
             w_res = self._create_string_wrapped(start, i, nonascii)
-            print w_res
         else:
             w_res = entry.w_uni
         return w_res
@@ -656,11 +649,26 @@
         i += 1
         return self._decode_key_string(i)
 
-class WrappedCacheEntry(object):
+class CacheEntry(object):
+    """ A cache entry, bundling the encoded version of a string, and its wrapped
+    decoded variant. """
     def __init__(self, repr, w_uni):
+        # repr is the escaped string
         self.repr = repr
+        # w_uni is the wrapped decoded string
         self.w_uni = w_uni
 
+    def compare(self, ll_chars, start, length):
+        """ Check whether self.repr occurs at ll_chars[start:start+length] """
+        if length != len(self.repr):
+            return False
+        index = start
+        for c in self.repr:
+            if not ll_chars[index] == c:
+                return False
+            index += 1
+        return True
+
 
 class MapBase(object):
     """ A map implementation to speed up parsing of json dicts, and to
@@ -745,6 +753,9 @@
         # but then it also contains .single_nextmap
         self.all_next = None # later dict {key: nextmap}
 
+        # keep some statistics about every map: how often it was instantiated
+        # and how many non-blocked leaves the map transition tree has, starting
+        # from self
         self.instantiation_count = 0
         self.number_of_leaves = 1
 
@@ -783,13 +794,15 @@
             next = self._make_next_map(w_key, string[start:stop])
             self.all_next[w_key] = next
 
-            # fix number_of_leaves
+            # one new leaf has been created
             self.change_number_of_leaves(1)
 
         terminator.register_potential_fringe(next)
         return next
 
     def change_number_of_leaves(self, difference):
+        if not difference:
+            return
         parent = self
         while isinstance(parent, JSONMap):
             parent.number_of_leaves += difference
@@ -797,6 +810,9 @@
         parent.number_of_leaves += difference # terminator
 
     def fast_path_key_parse(self, decoder, position):
+        """ Fast path when parsing the next key: We speculate that we will
+        always see a commonly seen next key, and use strcmp (implemented in
+        key_repr_cmp) to check whether that is the case. """
         single_nextmap = self.single_nextmap
         if single_nextmap:
             ll_chars = decoder.ll_chars
@@ -810,7 +826,7 @@
         This does a few things, including updating the self size estimate with
         the knowledge that one object transitioned from self to newmap.
         also it potentially decides that self should move to state USEFUL."""
-        self.instantiation_count += 1
+        newmap.instantiation_count += 1
         if isinstance(self, JSONMap) and self.state == MapBase.FRINGE:
             if self.is_useful():
                 self.mark_useful(terminator)
@@ -915,6 +931,10 @@
         self.keys_in_order = None
         self.strategy_instance = None
 
+    def __repr__(self):
+        return "<JSONMap key_repr=%s #instantiation=%s #leaves=%s prev=%r>" % (
+                self.key_repr, self.instantiation_count, self.number_of_leaves, self.prev)
+
     def _get_terminator(self): # only for _check_invariants
         while isinstance(self, JSONMap):
             self = self.prev
@@ -961,6 +981,8 @@
         self.state = MapBase.USEFUL
         if was_fringe:
             terminator.remove_from_fringe(self)
+        # find the most commonly instantiated child, store it into
+        # single_nextmap and mark it useful, recursively
         maxchild = self.single_nextmap
         if self.all_next is not None:
             for child in self.all_next.itervalues():
@@ -975,6 +997,7 @@
                 self.single_nextmap = maxchild
 
     def mark_blocked(self, terminator):
+        """ mark self and recursively all its children as blocked."""
         was_fringe = self.state == MapBase.FRINGE
         self.state = MapBase.BLOCKED
         if was_fringe:
@@ -1059,7 +1082,7 @@
         if self.all_next is None:
             l = int(self.single_nextmap is not None)
         else:
-            l = len(self.all_next) + 1
+            l = len(self.all_next)
         extra = ""
         if self.decoded_strings:
             extra = "\\n%s/%s (%s%%)" % (self.cache_hits, self.decoded_strings, self.cache_hits/float(self.decoded_strings))
diff --git a/pypy/module/_pypyjson/test/test__pypyjson.py b/pypy/module/_pypyjson/test/test__pypyjson.py
--- a/pypy/module/_pypyjson/test/test__pypyjson.py
+++ b/pypy/module/_pypyjson/test/test__pypyjson.py
@@ -60,23 +60,23 @@
         assert m3.get_index(w_c) == 1
         assert m3.get_index(w_a) == -1
 
-    def test_decode_key(self):
+    def test_decode_key_map(self):
         m = Terminator(self.space)
         m_diff = Terminator(self.space)
         for s1 in ["abc", "1001" * 10, u"ä".encode("utf-8")]:
             s = ' "%s"   "%s" "%s"' % (s1, s1, s1)
             dec = JSONDecoder(self.space, s)
             assert dec.pos == 0
-            m1 = dec.decode_key(dec.skip_whitespace(0), m)
+            m1 = dec.decode_key_map(dec.skip_whitespace(0), m)
             assert m1.w_key._utf8 == s1
             assert m1.key_repr == '"%s"' % s1
 
             # check caching on w_key level
-            m2 = dec.decode_key(dec.skip_whitespace(dec.pos), m_diff)
+            m2 = dec.decode_key_map(dec.skip_whitespace(dec.pos), m_diff)
             assert m1.w_key is m2.w_key
 
             # check caching on map level
-            m3 = dec.decode_key(dec.skip_whitespace(dec.pos), m_diff)
+            m3 = dec.decode_key_map(dec.skip_whitespace(dec.pos), m_diff)
             assert m3 is m2
             dec.close()
 
@@ -150,6 +150,14 @@
         assert m2.number_of_leaves == 3
         assert m5.number_of_leaves == 1
 
+    def test_number_of_leaves_after_mark_blocked(self):
+        w_x = self.space.newutf8("x", 1)
+        base, m1, m2, m3, m4 = self._make_some_maps()
+        m5 = m2.get_next(w_x, 'x"', 0, 2, base)
+        assert base.number_of_leaves == 3
+        m2.mark_blocked(base)
+        assert base.number_of_leaves == 1
+
     def test_mark_useful_cleans_fringe(self):
         base, m1, m2, m3, m4 = self._make_some_maps()
         base.instantiation_count = 6
@@ -225,6 +233,19 @@
         m2.mark_blocked(base)
         assert base.number_of_leaves == 1
 
+    def test_instatiation_count(self):
+        m = Terminator(self.space)
+        dec = JSONDecoder(self.space, '"abc" "def"')
+        m1 = dec.decode_key_map(dec.skip_whitespace(0), m)
+        m2 = dec.decode_key_map(dec.skip_whitespace(6), m1)
+        m1 = dec.decode_key_map(dec.skip_whitespace(0), m)
+        m2 = dec.decode_key_map(dec.skip_whitespace(6), m1)
+        m1 = dec.decode_key_map(dec.skip_whitespace(0), m)
+
+        assert m1.instantiation_count == 3
+        assert m2.instantiation_count == 2
+        dec.close()
+
 
 class AppTest(object):
     spaceconfig = {"objspace.usemodules._pypyjson": True}


More information about the pypy-commit mailing list