[pypy-commit] pypy py3.5-marshal3: in-progress
arigo
pypy.commits at gmail.com
Sat Aug 27 13:06:55 EDT 2016
Author: Armin Rigo <arigo at tunes.org>
Branch: py3.5-marshal3
Changeset: r86610:fec7aa29e819
Date: 2016-08-27 19:06 +0200
http://bitbucket.org/pypy/pypy/changeset/fec7aa29e819/
Log: in-progress
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -837,13 +837,13 @@
self.interned_strings.set(u, w_s1)
return w_s1
- def is_interned_str(self, s):
+ def get_interned_str(self, s):
"""Assumes an identifier (utf-8 encoded str)"""
# interface for marshal_impl
if not we_are_translated():
assert type(s) is str
u = s.decode('utf-8')
- return self.interned_strings.get(u) is not None
+ return self.interned_strings.get(u) # may be None
def descr_self_interp_w(self, RequiredClass, w_obj):
if not isinstance(w_obj, RequiredClass):
diff --git a/pypy/module/imp/importing.py b/pypy/module/imp/importing.py
--- a/pypy/module/imp/importing.py
+++ b/pypy/module/imp/importing.py
@@ -228,7 +228,7 @@
# CPython + 7 = default_magic -- used by PyPy (incompatible!)
#
from pypy.interpreter.pycode import default_magic
-MARSHAL_VERSION_FOR_PYC = 3
+MARSHAL_VERSION_FOR_PYC = 4
def get_pyc_magic(space):
return default_magic
diff --git a/pypy/module/marshal/interp_marshal.py b/pypy/module/marshal/interp_marshal.py
--- a/pypy/module/marshal/interp_marshal.py
+++ b/pypy/module/marshal/interp_marshal.py
@@ -11,14 +11,14 @@
# objects, are supported. Version 3 of this protocol properly
# supports circular links and sharing. The previous version is called
# "2", like in Python 2.7, although it is not always compatible
-# between CPython 2.7 and CPython 3.x.
+# between CPython 2.7 and CPython 3.4.
#
# XXX: before py3k, there was logic to do efficiently dump()/load() on
# a file object. The corresponding logic is gone from CPython 3.x, so
# I don't feel bad about killing it here too.
#
-Py_MARSHAL_VERSION = 3
+Py_MARSHAL_VERSION = 4
@unwrap_spec(w_version=WrappedDefault(Py_MARSHAL_VERSION))
@@ -127,7 +127,15 @@
## self.put = putfunc
self.writer = writer
self.version = version
- self.stringtable = {}
+ self.all_refs = {}
+ # all_refs = {w_obj: index} for all w_obj that are of a
+ # "reasonably sharable" type. CPython checks the refcount of
+ # any object to know if it is sharable, independently of its
+ # type. We can't do that. We could do a two-pass marshaller.
+ # For now we simply add to this list all objects that marshal to
+ # more than a few fixed-sized bytes, minus ones like code
+ # objects that never appear more than once except in complete
+ # corner cases.
## currently we cannot use a put that is a bound method
## from outside. Same holds for get.
@@ -207,10 +215,13 @@
rstackovf.check_stack_overflow()
self._overflow()
- def put_tuple_w(self, typecode, lst_w):
+ def put_tuple_w(self, typecode, lst_w, single_byte_size=False):
self.start(typecode)
lng = len(lst_w)
- self.put_int(lng)
+ if single_byte_size:
+ self.put(chr(lng))
+ else:
+ self.put_int(lng)
idx = 0
while idx < lng:
w_obj = lst_w[idx]
@@ -301,18 +312,35 @@
def invalid_typecode(space, u, tc):
- u.raise_exc("bad marshal data (unknown type code)")
+ u.raise_exc("bad marshal data (unknown type code %d)" % (ord(tc),))
+def _make_unmarshall_and_save_ref(func):
+ def unmarshall_save_ref(space, u, tc):
+ index = len(u.refs_w)
+ u.refs_w.append(None)
+ w_obj = func(space, u, tc)
+ u.refs_w[index] = w_obj
+ return w_obj
+ return unmarshall_save_ref
-class Unmarshaller(_Base):
+def _make_unmarshaller_dispatch():
_dispatch = [invalid_typecode] * 256
for tc, func in get_unmarshallers():
_dispatch[ord(tc)] = func
+ for tc, func in get_unmarshallers():
+ if tc < '\x80' and _dispatch[ord(tc) + 0x80] is invalid_typecode:
+ _dispatch[ord(tc) + 0x80] = _make_unmarshall_and_save_ref(func)
+ return _dispatch
+
+
+class Unmarshaller(_Base):
+ _dispatch = _make_unmarshaller_dispatch()
def __init__(self, space, reader):
self.space = space
self.reader = reader
+ self.refs_w = []
def get(self, n):
assert n >= 0
@@ -322,6 +350,10 @@
# the [0] is used to convince the annotator to return a char
return self.get(1)[0]
+ def save_ref(self, typecode, w_obj):
+ if typecode >= '\x80':
+ self.refs_w.append(w_obj)
+
def atom_str(self, typecode):
self.start(typecode)
lng = self.get_lng()
@@ -392,8 +424,11 @@
self._overflow()
# inlined version to save a recursion level
- def get_tuple_w(self):
- lng = self.get_lng()
+ def get_tuple_w(self, single_byte_size=False):
+ if single_byte_size:
+ lng = ord(self.get1())
+ else:
+ lng = self.get_lng()
res_w = [None] * lng
idx = 0
space = self.space
@@ -409,9 +444,6 @@
raise oefmt(space.w_TypeError, "NULL object in marshal data")
return res_w
- def get_list_w(self):
- return self.get_tuple_w()[:]
-
def _overflow(self):
self.raise_exc('object too deeply nested to unmarshal')
diff --git a/pypy/module/marshal/test/test_marshal.py b/pypy/module/marshal/test/test_marshal.py
--- a/pypy/module/marshal/test/test_marshal.py
+++ b/pypy/module/marshal/test/test_marshal.py
@@ -197,7 +197,7 @@
def test_bad_typecode(self):
import marshal
exc = raises(ValueError, marshal.loads, bytes([1]))
- assert str(exc.value) == "bad marshal data (unknown type code)"
+ assert str(exc.value).startswith("bad marshal data (unknown type code")
def test_bad_data(self):
# If you have sufficiently little memory, the line at the end of the
diff --git a/pypy/module/marshal/test/test_marshalimpl.py b/pypy/module/marshal/test/test_marshalimpl.py
--- a/pypy/module/marshal/test/test_marshalimpl.py
+++ b/pypy/module/marshal/test/test_marshalimpl.py
@@ -34,10 +34,21 @@
def test_shared_string(self):
x = "hello, "
x += "world"
- s = marshal.dumps((x, x))
- assert s.count(x) == 1
- y = marshal.loads(s)
- assert y == (x, x)
+ xl = 256
+ xl **= 100
+ for version in [2, 3]:
+ s = marshal.dumps((x, x), version)
+ assert s.count(x) == 2 if version < 3 else 1
+ y = marshal.loads(s)
+ assert y == (x, x)
+ #
+ s = marshal.dumps((xl, xl), version)
+ if version < 3:
+ assert 200 < len(s) < 250
+ else:
+ assert 100 < len(s) < 125
+ yl = marshal.loads(s)
+ assert yl == (xl, xl)
class AppTestMarshalSmallLong(AppTestMarshalMore):
diff --git a/pypy/objspace/std/marshal_impl.py b/pypy/objspace/std/marshal_impl.py
--- a/pypy/objspace/std/marshal_impl.py
+++ b/pypy/objspace/std/marshal_impl.py
@@ -2,6 +2,7 @@
from rpython.rlib.rstring import StringBuilder
from rpython.rlib.rstruct import ieee
from rpython.rlib.unroll import unrolling_iterable
+from rpython.rlib import objectmodel
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.special import Ellipsis
@@ -46,12 +47,14 @@
TYPE_SET = '<'
TYPE_FROZENSET = '>'
FLAG_REF = 0x80 # bit added to mean "add obj to index"
+FLAG_DONE = '\x00'
-TYPE_ASCII = 'a' # never generated so far
-TYPE_ASCII_INTERNED = 'A' # never generated so far
+# the following typecodes have been added in version 4.
+TYPE_ASCII = 'a' # never generated so far by pypy
+TYPE_ASCII_INTERNED = 'A' # never generated so far by pypy
TYPE_SMALL_TUPLE = ')'
-TYPE_SHORT_ASCII = 'z' # never generated so far
-TYPE_SHORT_ASCII_INTERNED = 'Z' # never generated so far
+TYPE_SHORT_ASCII = 'z' # never generated so far by pypy
+TYPE_SHORT_ASCII_INTERNED = 'Z' # never generated so far by pypy
_marshallers = []
@@ -63,12 +66,33 @@
return f
return _decorator
-def unmarshaller(tc):
+def unmarshaller(tc, save_ref=False):
def _decorator(f):
+ assert tc < '\x80'
_unmarshallers.append((tc, f))
+ if save_ref:
+ tcref = chr(ord(tc) + 0x80)
+ _unmarshallers.append((tcref, f))
return f
return _decorator
+def write_ref(typecode, w_obj, m):
+ if m.version < 3:
+ return typecode # not writing object references
+ try:
+ index = m.all_refs[w_obj]
+ except KeyError:
+ # we don't support long indices
+ index = len(m.all_refs)
+ if index >= 0x7fffffff:
+ return typecode
+ m.all_refs[w_obj] = index
+ return chr(ord(typecode) + FLAG_REF)
+ else:
+ # write the reference index to the stream
+ m.atom_int(TYPE_REF, index)
+ return FLAG_DONE
+
def marshal(space, w_obj, m):
# _marshallers_unroll is defined at the end of the file
for type, func in _marshallers_unroll:
@@ -82,10 +106,11 @@
s = space.readbuf_w(w_obj)
except OperationError as e:
if e.match(space, space.w_TypeError):
- raise oefmt(space.w_ValueError, "cannot marshal '%T' object",
- w_obj)
+ raise oefmt(space.w_ValueError, "unmarshallable object")
raise
- m.atom_str(TYPE_STRING, s.as_str())
+ typecode = write_ref(TYPE_STRING, w_obj, m)
+ if typecode != FLAG_DONE:
+ m.atom_str(typecode, s.as_str())
def get_unmarshallers():
return _unmarshallers
@@ -116,7 +141,7 @@
@marshaller(W_TypeObject)
def marshal_stopiter(space, w_type, m):
if not space.is_w(w_type, space.w_StopIteration):
- raise oefmt(space.w_ValueError, "cannot marshal type object")
+ raise oefmt(space.w_ValueError, "unmarshallable object")
m.atom(TYPE_STOPITER)
@unmarshaller(TYPE_STOPITER)
@@ -137,7 +162,7 @@
def marshal_int(space, w_int, m):
y = w_int.intval >> 31
if y and y != -1:
- _marshal_bigint(space, space.bigint_w(w_int), m)
+ marshal_long(space, w_int, m)
else:
m.atom_int(TYPE_INT, w_int.intval)
@@ -148,13 +173,14 @@
@marshaller(W_AbstractLongObject)
def marshal_long(space, w_long, m):
- _marshal_bigint(space, w_long.asbigint(), m)
-
-def _marshal_bigint(space, num, m):
from rpython.rlib.rarithmetic import r_ulonglong
- m.start(TYPE_LONG)
+ typecode = write_ref(TYPE_LONG, w_long, m)
+ if typecode == FLAG_DONE:
+ return
+ m.start(typecode)
SHIFT = 15
MASK = (1 << SHIFT) - 1
+ num = space.bigint_w(w_long)
sign = num.sign
num = num.abs()
total_length = (num.bit_length() + (SHIFT - 1)) / SHIFT
@@ -242,8 +268,10 @@
@marshaller(W_BytesObject)
def marshal_bytes(space, w_str, m):
- s = w_str.unwrap(space)
- m.atom_str(TYPE_STRING, s)
+ typecode = write_ref(TYPE_STRING, w_str, m)
+ if typecode != FLAG_DONE:
+ s = space.bytes_w(w_str)
+ m.atom_str(typecode, s)
@unmarshaller(TYPE_STRING)
def unmarshal_bytes(space, u, tc):
@@ -253,40 +281,62 @@
@marshaller(W_AbstractTupleObject)
def marshal_tuple(space, w_tuple, m):
items = w_tuple.tolist()
- m.put_tuple_w(TYPE_TUPLE, items)
+ if m.version >= 4 and len(items) < 256:
+ typecode = TYPE_SMALL_TUPLE
+ single_byte_size = True
+ else:
+ typecode = TYPE_TUPLE
+ single_byte_size = False
+ typecode = write_ref(typecode, w_tuple, m)
+ if typecode != FLAG_DONE:
+ m.put_tuple_w(typecode, items, single_byte_size=single_byte_size)
@unmarshaller(TYPE_TUPLE)
def unmarshal_tuple(space, u, tc):
items_w = u.get_tuple_w()
return space.newtuple(items_w)
+ at unmarshaller(TYPE_SMALL_TUPLE)
+def unmarshal_tuple(space, u, tc):
+ items_w = u.get_tuple_w(single_byte_size=True)
+ return space.newtuple(items_w)
+
@marshaller(W_ListObject)
def marshal_list(space, w_list, m):
- items = w_list.getitems()[:]
- m.put_tuple_w(TYPE_LIST, items)
+ typecode = write_ref(TYPE_LIST, w_list, m)
+ if typecode != FLAG_DONE:
+ items = w_list.getitems()[:]
+ m.put_tuple_w(typecode, items)
- at unmarshaller(TYPE_LIST)
+ at unmarshaller(TYPE_LIST, save_ref=True)
def unmarshal_list(space, u, tc):
- items_w = u.get_list_w()
- return space.newlist(items_w)
+ w_obj = space.newlist([])
+ u.save_ref(tc, w_obj)
+ for w_item in u.get_tuple_w():
+ w_obj.append(w_item)
+ return w_obj
@marshaller(W_DictMultiObject)
def marshal_dict(space, w_dict, m):
- m.start(TYPE_DICT)
+ typecode = write_ref(TYPE_DICT, w_dict, m)
+ if typecode == FLAG_DONE:
+ return
+ m.start(typecode)
for w_tuple in w_dict.items():
w_key, w_value = space.fixedview(w_tuple, 2)
m.put_w_obj(w_key)
m.put_w_obj(w_value)
m.atom(TYPE_NULL)
- at unmarshaller(TYPE_DICT)
+ at unmarshaller(TYPE_DICT, save_ref=True)
def unmarshal_dict(space, u, tc):
# since primitive lists are not optimized and we don't know
# the dict size in advance, use the dict's setitem instead
# of building a list of tuples.
w_dic = space.newdict()
+ u.save_ref(tc, w_dic)
while 1:
w_key = u.get_w_obj(allow_null=True)
if w_key is None:
@@ -308,6 +358,7 @@
@marshaller(PyCode)
def marshal_pycode(space, w_pycode, m):
+ # (no attempt at using write_ref here, there is little point imho)
m.start(TYPE_CODE)
# see pypy.interpreter.pycode for the layout
x = space.interp_w(PyCode, w_pycode)
@@ -353,8 +404,10 @@
lng = u.atom_lng(tc)
return [unmarshal_str(u) for i in range(lng)]
- at unmarshaller(TYPE_CODE)
+ at unmarshaller(TYPE_CODE, save_ref=True)
def unmarshal_pycode(space, u, tc):
+ w_codeobj = objectmodel.instantiate(PyCode)
+ u.save_ref(tc, w_codeobj)
argcount = u.get_int()
kwonlyargcount = u.get_int()
nlocals = u.get_int()
@@ -372,50 +425,85 @@
name = unmarshal_str(u)
firstlineno = u.get_int()
lnotab = unmarshal_str(u)
- return PyCode(space, argcount, kwonlyargcount, nlocals, stacksize, flags,
+ PyCode.__init__(w_codeobj,
+ space, argcount, kwonlyargcount, nlocals, stacksize, flags,
code, consts_w[:], names, varnames, filename,
name, firstlineno, lnotab, freevars, cellvars)
+ return w_codeobj
@marshaller(W_UnicodeObject)
def marshal_unicode(space, w_unicode, m):
s = unicodehelper.encode_utf8(space, space.unicode_w(w_unicode),
allow_surrogates=True)
- m.atom_str(TYPE_UNICODE, s)
+ if m.version >= 3:
+ w_interned = space.get_interned_str(s)
+ else:
+ w_interned = None
+ if w_interned is not None:
+ w_unicode = w_interned # use the interned W_UnicodeObject
+ typecode = TYPE_INTERNED # as a key for u.all_refs
+ else:
+ typecode = TYPE_UNICODE
+ typecode = write_ref(typecode, w_unicode, m)
+ if typecode != FLAG_DONE:
+ m.atom_str(typecode, s)
@unmarshaller(TYPE_UNICODE)
def unmarshal_unicode(space, u, tc):
- return space.wrap(unicodehelper.decode_utf8(space, u.get_str(),
- allow_surrogates=True))
+ uc = unicodehelper.decode_utf8(space, u.get_str(), allow_surrogates=True)
+ return space.newunicode(uc)
+
+ at unmarshaller(TYPE_INTERNED)
+def unmarshal_bytes(space, u, tc):
+ return space.new_interned_str(u.get_str())
+
@marshaller(W_SetObject)
def marshal_set(space, w_set, m):
- lis_w = space.fixedview(w_set)
- m.put_tuple_w(TYPE_SET, lis_w)
+ typecode = write_ref(TYPE_SET, w_set, m)
+ if typecode != FLAG_DONE:
+ lis_w = space.fixedview(w_set)
+ m.put_tuple_w(typecode, lis_w)
- at unmarshaller(TYPE_SET)
+ at unmarshaller(TYPE_SET, save_ref=True)
def unmarshal_set(space, u, tc):
- return unmarshal_set_frozenset(space, u, tc)
+ w_set = space.call_function(space.w_set)
+ u.save_ref(tc, w_set)
+ _unmarshal_set_frozenset(space, u, w_set)
+ return w_set
@marshaller(W_FrozensetObject)
def marshal_frozenset(space, w_frozenset, m):
- lis_w = space.fixedview(w_frozenset)
- m.put_tuple_w(TYPE_FROZENSET, lis_w)
+ typecode = write_ref(TYPE_FROZENSET, w_frozenset, m)
+ if typecode != FLAG_DONE:
+ lis_w = space.fixedview(w_frozenset)
+ m.put_tuple_w(typecode, lis_w)
-def unmarshal_set_frozenset(space, u, tc):
+def _unmarshal_set_frozenset(space, u, w_set):
lng = u.get_lng()
- w_set = space.call_function(space.w_set)
for i in xrange(lng):
w_obj = u.get_w_obj()
space.call_method(w_set, "add", w_obj)
- if tc == TYPE_FROZENSET:
- w_set = space.call_function(space.w_frozenset, w_set)
- return w_set
@unmarshaller(TYPE_FROZENSET)
def unmarshal_frozenset(space, u, tc):
- return unmarshal_set_frozenset(space, u, tc)
+ w_set = space.call_function(space.w_set)
+ _unmarshal_set_frozenset(space, u, w_set)
+ return space.call_function(space.w_frozenset, w_set)
+
+
+ at unmarshaller(TYPE_REF)
+def unmarshal_ref(space, u, tc):
+ index = u.get_lng()
+ if 0 <= index < len(u.refs_w):
+ w_obj = u.refs_w[index]
+ else:
+ w_obj = None
+ if w_obj is None:
+ raise oefmt(space.w_ValueError, "bad marshal data (invalid reference)")
+ return w_obj
_marshallers_unroll = unrolling_iterable(_marshallers)
More information about the pypy-commit
mailing list