[pypy-commit] pypy py3k: Implement array._array_reconstructor: used with pickle protocol 3, allows

Mon Nov 12 01:37:17 CET 2012

Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: py3k
Changeset: r58830:9f61bab5651f
Date: 2012-11-11 22:51 +0100
http://bitbucket.org/pypy/pypy/changeset/9f61bab5651f/

Log:	Implement array._array_reconstructor: used with pickle protocol 3,
	allows a machine-independent pickling of array objects.

diff --git a/pypy/module/array/__init__.py b/pypy/module/array/__init__.py
--- a/pypy/module/array/__init__.py
+++ b/pypy/module/array/__init__.py
@@ -11,6 +11,7 @@
     interpleveldefs = {
         'array': 'interp_array.W_ArrayBase',
         'ArrayType': 'interp_array.W_ArrayBase',
+        '_array_reconstructor': 'reconstructor.array_reconstructor',
     }
 
     appleveldefs = {
diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -72,7 +72,7 @@
 array_fromfile = SMM('fromfile', 3)
 
 array_buffer_info = SMM('buffer_info', 1)
-array_reduce = SMM('__reduce__', 1)
+array_reduce_ex = SMM('__reduce_ex__', 2)
 array_copy = SMM('__copy__', 1)
 array_byteswap = SMM('byteswap', 1)
 
@@ -124,6 +124,10 @@
         # hint for the annotator: track individual constant instances
         return True
 
+    def is_integer_type(self):
+        """Tell whether items of this typecode are unwrapped as integers."""
+        # Integer typecodes are the ones whose unwrap method is either
+        # 'int_w' or 'bigint_w'.
+        if self.unwrap == 'int_w':
+            return True
+        return self.unwrap == 'bigint_w'
+
+
 types = {
     'u': TypeCode(lltype.UniChar,     'unicode_w'),
     'b': TypeCode(rffi.SIGNEDCHAR,    'int_w', True, True),
@@ -639,17 +643,41 @@
         w_len = space.wrap(self.len)
         return space.newtuple([w_ptr, w_len])
 
-    def array_reduce__Array(space, self):
-        if self.len > 0:
-            w_s = array_tobytes__Array(space, self)
-            args = [space.wrap(mytype.typecode), w_s]
-        else:
-            args = [space.wrap(mytype.typecode)]
+    def array_reduce_ex__Array_ANY(space, self, w_protocol):
+        protocol = space.int_w(w_protocol)
         try:
-            dct = space.getattr(self, space.wrap('__dict__'))
+            w_dict = space.getattr(self, space.wrap('__dict__'))
         except OperationError:
-            dct = space.w_None
-        return space.newtuple([space.type(self), space.newtuple(args), dct])
+            w_dict = space.w_None
+        from pypy.module.array import reconstructor
+        mformat_code = reconstructor.typecode_to_mformat_code(mytype.typecode)
+        if protocol < 3 or mformat_code == reconstructor.UNKNOWN_FORMAT:
+            # Convert the array to a list if we got something weird
+            # (e.g., non-IEEE floats), or we are pickling the array
+            # using a Python 2.x compatible protocol.
+            #
+            # It is necessary to use a list representation for Python
+            # 2.x compatible pickle protocol, since Python 2's str
+            # objects are unpickled as unicode by Python 3. Thus it is
+            # impossible to make arrays unpicklable by Python 3 by
+            # using their memory representation, unless we resort to
+            # ugly hacks such as coercing unicode objects to bytes in
+            # array_reconstructor.
+            w_list = array_tolist__Array(space, self)
+            return space.newtuple([
+                    space.type(self),
+                    space.newtuple([space.wrap(mytype.typecode), w_list]),
+                    w_dict])
+            
+        w_bytes = array_tobytes__Array(space, self)
+        w_array_reconstructor = space.fromcache(State).w_array_reconstructor
+        return space.newtuple([
+                w_array_reconstructor,
+                space.newtuple([space.type(self),
+                                space.wrap(mytype.typecode),
+                                space.wrap(mformat_code),
+                                w_bytes]),
+                w_dict])
 
     def array_copy__Array(space, self):
         w_a = mytype.w_class(self.space)
@@ -709,4 +737,10 @@
 for mytype in types.values():
     make_array(mytype)
 
-register_all(locals(), globals())
+
+class State:
+    """Holder for the wrapped array._array_reconstructor function.
+
+    Instances are obtained through space.fromcache(State).
+    """
+    def __init__(self, space):
+        # Fetch the attribute from the 'array' builtin module once and
+        # keep the wrapped result on the instance.
+        w_array_module = space.getbuiltinmodule('array')
+        w_name = space.wrap("_array_reconstructor")
+        self.w_array_reconstructor = space.getattr(w_array_module, w_name)
+
diff --git a/pypy/module/array/reconstructor.py b/pypy/module/array/reconstructor.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/array/reconstructor.py
@@ -0,0 +1,193 @@
+# array._array_reconstructor is a special constructor used when
+# unpickling an array. It provides a portable way to rebuild an array
+# from its memory representation.
+import sys
+from pypy.interpreter.gateway import unwrap_spec
+from pypy.interpreter.error import OperationError
+from pypy.interpreter.argument import Arguments
+from pypy.rlib import runicode, rbigint
+from pypy.rlib.rstruct import ieee
+from pypy.rpython.lltypesystem import rffi
+
+from pypy.module.array import interp_array
+
+# Machine format codes: integers stored in the pickle next to the raw
+# bytes to describe how the items were laid out in memory.
+#
+# The numbering is relied upon by typecode_to_mformat_code(): every *_LE
+# code is immediately followed by its *_BE counterpart, and for the
+# integer codes the signed variant is the unsigned one plus 2.
+UNKNOWN_FORMAT = -1
+UNSIGNED_INT8 = 0
+SIGNED_INT8 = 1
+UNSIGNED_INT16_LE = 2
+UNSIGNED_INT16_BE = 3
+SIGNED_INT16_LE = 4
+SIGNED_INT16_BE = 5
+UNSIGNED_INT32_LE = 6
+UNSIGNED_INT32_BE = 7
+SIGNED_INT32_LE = 8
+SIGNED_INT32_BE = 9
+UNSIGNED_INT64_LE = 10
+UNSIGNED_INT64_BE = 11
+SIGNED_INT64_LE = 12
+SIGNED_INT64_BE = 13
+IEEE_754_FLOAT_LE = 14
+IEEE_754_FLOAT_BE = 15
+IEEE_754_DOUBLE_LE = 16
+IEEE_754_DOUBLE_BE = 17
+UTF16_LE = 18
+UTF16_BE = 19
+UTF32_LE = 20
+UTF32_BE = 21
+
+# True on a big-endian host.  Added (as 0 or 1) to a *_LE code to select
+# the code that matches the host byte order.
+IS_BIG_ENDIAN = sys.byteorder == 'big'
+
+class MachineFormat(object):
+    """Describe one machine format: item width, signedness, byte order."""
+    def __init__(self, bytes, signed, big_endian):
+        # True when the items are stored most-significant byte first.
+        self.big_endian = big_endian
+        # True when the items are signed integers.
+        self.signed = signed
+        # Width of a single item, in bytes.
+        self.bytes = bytes
+
+# Maps each machine format code to MachineFormat(bytes, signed, big_endian).
+#
+# NOTE(review): array_reconstructor() decodes the UTF16_*/UTF32_* codes
+# with the "utf-16-*"/"utf-32-*" codecs and never reads their descriptors,
+# so the 4/8 byte widths listed for them are unused in this module --
+# confirm that these values are intentional.
+format_descriptors = {
+    UNSIGNED_INT8:      MachineFormat(1, False, False),
+    SIGNED_INT8:        MachineFormat(1, True, False),
+    UNSIGNED_INT16_LE:  MachineFormat(2, False, False),
+    UNSIGNED_INT16_BE:  MachineFormat(2, False, True),
+    SIGNED_INT16_LE:    MachineFormat(2, True, False),
+    SIGNED_INT16_BE:    MachineFormat(2, True, True),
+    UNSIGNED_INT32_LE:  MachineFormat(4, False, False),
+    UNSIGNED_INT32_BE:  MachineFormat(4, False, True),
+    SIGNED_INT32_LE:    MachineFormat(4, True, False),
+    SIGNED_INT32_BE:    MachineFormat(4, True, True),
+    UNSIGNED_INT64_LE:  MachineFormat(8, False, False),
+    UNSIGNED_INT64_BE:  MachineFormat(8, False, True),
+    SIGNED_INT64_LE:    MachineFormat(8, True, False),
+    SIGNED_INT64_BE:    MachineFormat(8, True, True),
+    IEEE_754_FLOAT_LE:  MachineFormat(4, False, False),
+    IEEE_754_FLOAT_BE:  MachineFormat(4, False, True),
+    IEEE_754_DOUBLE_LE: MachineFormat(8, False, False),
+    IEEE_754_DOUBLE_BE: MachineFormat(8, False, True),
+    UTF16_LE:           MachineFormat(4, False, False),
+    UTF16_BE:           MachineFormat(4, False, True),
+    UTF32_LE:           MachineFormat(8, False, False),
+    UTF32_BE:           MachineFormat(8, False, True),
+}
+# Smallest and largest known codes; array_reconstructor() uses them to
+# range-check the mformat_code it receives.
+MACHINE_FORMAT_CODE_MIN = min(format_descriptors)
+MACHINE_FORMAT_CODE_MAX = max(format_descriptors)
+
+
+@unwrap_spec(typecode=str, mformat_code=int)
+def array_reconstructor(space, w_cls, typecode, mformat_code, w_items):
+    """Rebuild an array from the payload produced by __reduce_ex__.
+
+    w_cls is the class handed to interp_array.w_array to build the
+    result, typecode the requested array type code, mformat_code one of
+    the machine format codes defined in this module, and w_items the
+    pickled items (read as a byte string on the slow path).
+
+    Raises an app-level ValueError when the typecode is unknown, when
+    mformat_code lies outside the known range, or when the byte string
+    of a numeric format is not a whole number of items long.
+    """
+    # Fast path: machine format code corresponds to the
+    # platform-independent typecode.
+    if mformat_code == typecode_to_mformat_code(typecode):
+        return interp_array.w_array(
+            space, w_cls, typecode, Arguments(space, [w_items]))
+
+    if typecode not in interp_array.types:
+        raise OperationError(space.w_ValueError,
+                             space.wrap("invalid type code"))
+    if (mformat_code < MACHINE_FORMAT_CODE_MIN or
+        mformat_code > MACHINE_FORMAT_CODE_MAX):
+        raise OperationError(space.w_ValueError,
+                             space.wrap("invalid machine format code"))
+
+    # Slow path: Decode the byte string according to the given machine
+    # format code. This occurs when the computer unpickling the array
+    # object is architecturally different from the one that pickled
+    # the array.
+    if (mformat_code == IEEE_754_FLOAT_LE or
+        mformat_code == IEEE_754_FLOAT_BE or
+        mformat_code == IEEE_754_DOUBLE_LE or
+        mformat_code == IEEE_754_DOUBLE_BE):
+
+        memstr = space.bytes_w(w_items)
+        descr = format_descriptors[mformat_code]
+        # Reject a trailing partial item before slicing: a short slice
+        # must not reach ieee.unpack_float.
+        _check_items_length(space, memstr, descr.bytes)
+        converted_items = [
+            space.wrap(ieee.unpack_float(
+                    memstr[i:i+descr.bytes],
+                    descr.big_endian))
+            for i in range(0, len(memstr), descr.bytes)]
+        w_converted_items = space.newlist(converted_items)
+
+    elif mformat_code == UTF16_LE:
+        w_converted_items = space.call_method(
+            w_items, "decode", space.wrap("utf-16-le"))
+    elif mformat_code == UTF16_BE:
+        w_converted_items = space.call_method(
+            w_items, "decode", space.wrap("utf-16-be"))
+    elif mformat_code == UTF32_LE:
+        w_converted_items = space.call_method(
+            w_items, "decode", space.wrap("utf-32-le"))
+    elif mformat_code == UTF32_BE:
+        w_converted_items = space.call_method(
+            w_items, "decode", space.wrap("utf-32-be"))
+    else:
+        descr = format_descriptors[mformat_code]
+        # If possible, try to pack array's items using a data type
+        # that fits better. This may result in an array with narrower
+        # or wider elements.
+        #
+        # For example, if a 32-bit machine pickles a L-code array of
+        # unsigned longs, then the array will be unpickled by 64-bit
+        # machine as an I-code array of unsigned ints.
+        #
+        # XXX: Is it possible to write a unit test for this?
+        for tc in interp_array.unroll_typecodes:
+            typecode_descr = interp_array.types[tc]
+            if (typecode_descr.is_integer_type() and
+                typecode_descr.bytes == descr.bytes and
+                typecode_descr.signed == descr.signed):
+                typecode = tc
+                break
+
+        memstr = space.bytes_w(w_items)
+        # Without this check a trailing partial item would silently be
+        # decoded as an extra (wrong) integer.
+        _check_items_length(space, memstr, descr.bytes)
+        converted_items = [
+            space.newlong_from_rbigint(rbigint.rbigint.frombytes(
+                memstr[i:i+descr.bytes],
+                descr.big_endian and 'big' or 'little',
+                descr.signed))
+            for i in range(0, len(memstr), descr.bytes)]
+        w_converted_items = space.newlist(converted_items)
+
+    return interp_array.w_array(
+        space, w_cls, typecode, Arguments(space, [w_converted_items]))
+
+
+def _check_items_length(space, memstr, itemsize):
+    # Raise an app-level ValueError unless memstr holds whole items only.
+    if len(memstr) % itemsize != 0:
+        raise OperationError(
+            space.w_ValueError,
+            space.wrap("bytes length not a multiple of item size"))
+
+def typecode_to_mformat_code(typecode):
+    """Return the machine format code of `typecode` on this host.
+
+    UNKNOWN_FORMAT is returned for a typecode that is not handled here,
+    or for an integer typecode whose host width is not 2, 4 or 8 bytes.
+    """
+    # Each *_LE code is directly followed by its *_BE twin, so adding
+    # IS_BIG_ENDIAN (0 or 1) picks the variant for the host byte order.
+    if typecode == 'b':
+        return SIGNED_INT8
+    if typecode == 'B':
+        return UNSIGNED_INT8
+    if typecode == 'u':
+        if runicode.MAXUNICODE == 0xffff:
+            return UTF16_LE + IS_BIG_ENDIAN
+        return UTF32_LE + IS_BIG_ENDIAN
+    if typecode == 'f':
+        return IEEE_754_FLOAT_LE + IS_BIG_ENDIAN
+    if typecode == 'd':
+        return IEEE_754_DOUBLE_LE + IS_BIG_ENDIAN
+
+    # Integer typecodes: the width comes from the host C type, and the
+    # lowercase letter is the signed flavour.
+    if typecode == 'h' or typecode == 'H':
+        width = rffi.sizeof(rffi.SHORT)
+    elif typecode == 'i' or typecode == 'I':
+        width = rffi.sizeof(rffi.INT)
+    elif typecode == 'l' or typecode == 'L':
+        width = rffi.sizeof(rffi.LONG)
+    else:
+        return UNKNOWN_FORMAT
+    signed = typecode == 'h' or typecode == 'i' or typecode == 'l'
+
+    if width == 2:
+        unsigned_le = UNSIGNED_INT16_LE
+    elif width == 4:
+        unsigned_le = UNSIGNED_INT32_LE
+    elif width == 8:
+        unsigned_le = UNSIGNED_INT64_LE
+    else:
+        return UNKNOWN_FORMAT
+    # The signed codes sit two slots above the unsigned ones.
+    return unsigned_le + IS_BIG_ENDIAN + (2 * signed)
diff --git a/pypy/module/array/test/test_array.py b/pypy/module/array/test/test_array.py
--- a/pypy/module/array/test/test_array.py
+++ b/pypy/module/array/test/test_array.py
@@ -524,18 +524,18 @@
     def test_reduce(self):
         import pickle
         a = self.array('i', [1, 2, 3])
-        s = pickle.dumps(a, 1)
+        s = pickle.dumps(a)
         b = pickle.loads(s)
         assert a == b
 
         a = self.array('l')
-        s = pickle.dumps(a, 1)
+        s = pickle.dumps(a)
         b = pickle.loads(s)
         assert len(b) == 0 and b.typecode == 'l'
 
         a = self.array('i', [1, 2, 4])
         i = iter(a)
-        #raises(TypeError, pickle.dumps, i, 1)
+        #raises(TypeError, pickle.dumps, i)
 
     def test_copy_swap(self):
         a = self.array('i', [1, 2, 3])
@@ -920,3 +920,130 @@
     spaceconfig = AppTestArray.spaceconfig.copy()
     spaceconfig['objspace.std.builtinshortcut'] = True
 
+
+# Tests for array._array_reconstructor.  The machine format codes are
+# repeated as local constants inside each test; their values mirror the
+# ones defined in pypy/module/array/reconstructor.py.
+class AppTestArrayReconstructor:
+    spaceconfig = dict(usemodules=('array', 'struct'))
+
+    # Invalid arguments must be rejected with TypeError or ValueError.
+    def test_error(self):
+        import array
+        array_reconstructor = array._array_reconstructor
+        UNKNOWN_FORMAT = -1
+        # First argument is not a class.
+        raises(TypeError, array_reconstructor,
+               "", "b", 0, b"")
+        # First argument is a class, but not array.array.
+        raises(TypeError, array_reconstructor,
+               str, "b", 0, b"")
+        # Machine format code is not an integer.
+        raises(TypeError, array_reconstructor,
+               array.array, "b", '', b"")
+        # Items are given as str instead of bytes.
+        raises(TypeError, array_reconstructor,
+               array.array, "b", 0, "")
+        # Unknown typecode.
+        raises(ValueError, array_reconstructor,
+               array.array, "?", 0, b"")
+        # Machine format codes outside the known range (-1 and 22).
+        raises(ValueError, array_reconstructor,
+               array.array, "b", UNKNOWN_FORMAT, b"")
+        raises(ValueError, array_reconstructor,
+               array.array, "b", 22, b"")
+        # A single byte cannot hold an IEEE 754 double (code 16).
+        raises(ValueError, array_reconstructor,
+               array.array, "d", 16, b"a")
+
+    # Numeric formats: pack the values with struct in an explicit byte
+    # order, reconstruct them, and compare with an array built directly.
+    def test_numbers(self):
+        import array, struct
+        array_reconstructor = array._array_reconstructor
+        UNSIGNED_INT8 = 0
+        SIGNED_INT8 = 1
+        UNSIGNED_INT16_LE = 2
+        UNSIGNED_INT16_BE = 3
+        SIGNED_INT16_LE = 4
+        SIGNED_INT16_BE = 5
+        UNSIGNED_INT32_LE = 6
+        UNSIGNED_INT32_BE = 7
+        SIGNED_INT32_LE = 8
+        SIGNED_INT32_BE = 9
+        UNSIGNED_INT64_LE = 10
+        UNSIGNED_INT64_BE = 11
+        SIGNED_INT64_LE = 12
+        SIGNED_INT64_BE = 13
+        IEEE_754_FLOAT_LE = 14
+        IEEE_754_FLOAT_BE = 15
+        IEEE_754_DOUBLE_LE = 16
+        IEEE_754_DOUBLE_BE = 17
+        # Each entry is (typecodes to reconstruct as, machine format code,
+        # struct format used to build the byte string, values).
+        testcases = (
+            (['B', 'H', 'I', 'L'], UNSIGNED_INT8, '=BBBB',
+             [0x80, 0x7f, 0, 0xff]),
+            (['b', 'h', 'i', 'l'], SIGNED_INT8, '=bbb',
+             [-0x80, 0x7f, 0]),
+            (['H', 'I', 'L'], UNSIGNED_INT16_LE, '<HHHH',
+             [0x8000, 0x7fff, 0, 0xffff]),
+            (['H', 'I', 'L'], UNSIGNED_INT16_BE, '>HHHH',
+             [0x8000, 0x7fff, 0, 0xffff]),
+            (['h', 'i', 'l'], SIGNED_INT16_LE, '<hhh',
+             [-0x8000, 0x7fff, 0]),
+            (['h', 'i', 'l'], SIGNED_INT16_BE, '>hhh',
+             [-0x8000, 0x7fff, 0]),
+            (['I', 'L'], UNSIGNED_INT32_LE, '<IIII',
+             [1<<31, (1<<31)-1, 0, (1<<32)-1]),
+            (['I', 'L'], UNSIGNED_INT32_BE, '>IIII',
+             [1<<31, (1<<31)-1, 0, (1<<32)-1]),
+            (['i', 'l'], SIGNED_INT32_LE, '<iii',
+             [-1<<31, (1<<31)-1, 0]),
+            (['i', 'l'], SIGNED_INT32_BE, '>iii',
+             [-1<<31, (1<<31)-1, 0]),
+            (['L'], UNSIGNED_INT64_LE, '<QQQQ',
+             [1<<31, (1<<31)-1, 0, (1<<32)-1]),
+            (['L'], UNSIGNED_INT64_BE, '>QQQQ',
+             [1<<31, (1<<31)-1, 0, (1<<32)-1]),
+            (['l'], SIGNED_INT64_LE, '<qqq',
+             [-1<<31, (1<<31)-1, 0]),
+            (['l'], SIGNED_INT64_BE, '>qqq',
+             [-1<<31, (1<<31)-1, 0]),
+            # The following tests for INT64 will raise an OverflowError
+            # when run on a 32-bit machine. The tests are simply skipped
+            # in that case.
+            (['L'], UNSIGNED_INT64_LE, '<QQQQ',
+             [1<<63, (1<<63)-1, 0, (1<<64)-1]),
+            (['L'], UNSIGNED_INT64_BE, '>QQQQ',
+             [1<<63, (1<<63)-1, 0, (1<<64)-1]),
+            (['l'], SIGNED_INT64_LE, '<qqq',
+             [-1<<63, (1<<63)-1, 0]),
+            (['l'], SIGNED_INT64_BE, '>qqq',
+             [-1<<63, (1<<63)-1, 0]),
+            (['f'], IEEE_754_FLOAT_LE, '<ffff',
+             [16711938.0, float('inf'), float('-inf'), -0.0]),
+            (['f'], IEEE_754_FLOAT_BE, '>ffff',
+             [16711938.0, float('inf'), float('-inf'), -0.0]),
+            (['d'], IEEE_754_DOUBLE_LE, '<dddd',
+             [9006104071832581.0, float('inf'), float('-inf'), -0.0]),
+            (['d'], IEEE_754_DOUBLE_BE, '>dddd',
+             [9006104071832581.0, float('inf'), float('-inf'), -0.0])
+        )
+        for testcase in testcases:
+            valid_typecodes, mformat_code, struct_fmt, values = testcase
+            arraystr = struct.pack(struct_fmt, *values)
+            for typecode in valid_typecodes:
+                try:
+                    # Reference array; building it overflows when the
+                    # values do not fit the typecode on this host.
+                    a = array.array(typecode, values)
+                except OverflowError:
+                    continue  # Skip this test case.
+                b = array_reconstructor(
+                    array.array, typecode, mformat_code, arraystr)
+                assert a == b
+
+    # Unicode formats: the test string, encoded with each UTF codec, must
+    # reconstruct to the same 'u' array as the string itself.
+    def test_unicode(self):
+        import array
+        array_reconstructor = array._array_reconstructor
+        UTF16_LE = 18
+        UTF16_BE = 19
+        UTF32_LE = 20
+        UTF32_BE = 21
+        # Mixes ASCII, a Latin-1 character and two \U000xxxxx escapes.
+        teststr = "Bonne Journ\xe9e \U0002030a\U00020347"
+        testcases = (
+            (UTF16_LE, "UTF-16-LE"),
+            (UTF16_BE, "UTF-16-BE"),
+            (UTF32_LE, "UTF-32-LE"),
+            (UTF32_BE, "UTF-32-BE")
+        )
+        for testcase in testcases:
+            mformat_code, encoding = testcase
+            a = array.array('u', teststr)
+            b = array_reconstructor(
+                array.array, 'u', mformat_code, teststr.encode(encoding))
+            assert a == b