[Python-checkins] cpython: Make framing optional in pickle protocol 4.

alexandre.vassalotti python-checkins at python.org
Sun Nov 24 05:29:56 CET 2013


http://hg.python.org/cpython/rev/de9bda43d552
changeset:   87485:de9bda43d552
user:        Alexandre Vassalotti <alexandre at peadrop.com>
date:        Sat Nov 23 20:30:03 2013 -0800
summary:
  Make framing optional in pickle protocol 4.

This will allow us to control in the future whether to use framing or not.
For example, we may want to turn it off for tiny pickles, where it doesn't
help.

The change also improves performance slightly:

### fastpickle ###
Min: 0.608517 -> 0.557358: 1.09x faster
Avg: 0.798892 -> 0.694738: 1.15x faster
Significant (t=3.45)
Stddev: 0.17145 -> 0.12704: 1.3496x smaller
Timeline: http://goo.gl/3xQE1J

### pickle_dict ###
Min: 0.669920 -> 0.615271: 1.09x faster
Avg: 0.733633 -> 0.645058: 1.14x faster
Significant (t=5.05)
Stddev: 0.12041 -> 0.02961: 4.0662x smaller
Timeline: http://goo.gl/LpLSXI

### pickle_list ###
Min: 0.397583 -> 0.368112: 1.08x faster
Avg: 0.412784 -> 0.397223: 1.04x faster
Significant (t=2.78)
Stddev: 0.01518 -> 0.03653: 2.4068x larger
Timeline: http://goo.gl/v39E59

### unpickle_list ###
Min: 0.692935 -> 0.594870: 1.16x faster
Avg: 0.730012 -> 0.628395: 1.16x faster
Significant (t=17.76)
Stddev: 0.02720 -> 0.02995: 1.1012x larger
Timeline: http://goo.gl/2P9AEt

The following not significant results are hidden, use -v to show them:
fastunpickle.

files:
  Lib/pickle.py            |  130 +++++++++-----------
  Lib/test/pickletester.py |   39 ++++++
  Modules/_pickle.c        |  160 +++++---------------------
  3 files changed, 132 insertions(+), 197 deletions(-)


diff --git a/Lib/pickle.py b/Lib/pickle.py
--- a/Lib/pickle.py
+++ b/Lib/pickle.py
@@ -188,87 +188,72 @@
         self.file_write = file_write
         self.current_frame = None
 
-    def _commit_frame(self):
-        f = self.current_frame
-        with f.getbuffer() as data:
-            n = len(data)
-            write = self.file_write
-            write(FRAME)
-            write(pack("<Q", n))
-            write(data)
-        f.seek(0)
-        f.truncate()
-
     def start_framing(self):
         self.current_frame = io.BytesIO()
 
     def end_framing(self):
-        if self.current_frame is not None:
-            self._commit_frame()
+        if self.current_frame and self.current_frame.tell() > 0:
+            self.commit_frame(force=True)
             self.current_frame = None
 
+    def commit_frame(self, force=False):
+        if self.current_frame:
+            f = self.current_frame
+            if f.tell() >= self._FRAME_SIZE_TARGET or force:
+                with f.getbuffer() as data:
+                    n = len(data)
+                    write = self.file_write
+                    write(FRAME)
+                    write(pack("<Q", n))
+                    write(data)
+                f.seek(0)
+                f.truncate()
+
     def write(self, data):
-        f = self.current_frame
-        if f is None:
+        if self.current_frame:
+            return self.current_frame.write(data)
+        else:
             return self.file_write(data)
-        else:
-            n = len(data)
-            if f.tell() >= self._FRAME_SIZE_TARGET:
-                self._commit_frame()
-            return f.write(data)
+
 
 class _Unframer:
 
     def __init__(self, file_read, file_readline, file_tell=None):
         self.file_read = file_read
         self.file_readline = file_readline
-        self.file_tell = file_tell
-        self.framing_enabled = False
         self.current_frame = None
-        self.frame_start = None
 
     def read(self, n):
-        if n == 0:
-            return b''
-        _file_read = self.file_read
-        if not self.framing_enabled:
-            return _file_read(n)
-        f = self.current_frame
-        if f is not None:
-            data = f.read(n)
-            if data:
-                if len(data) < n:
-                    raise UnpicklingError(
-                        "pickle exhausted before end of frame")
-                return data
-        frame_opcode = _file_read(1)
-        if frame_opcode != FRAME:
-            raise UnpicklingError(
-                "expected a FRAME opcode, got {} instead".format(frame_opcode))
-        frame_size, = unpack("<Q", _file_read(8))
-        if frame_size > sys.maxsize:
-            raise ValueError("frame size > sys.maxsize: %d" % frame_size)
-        if self.file_tell is not None:
-            self.frame_start = self.file_tell()
-        f = self.current_frame = io.BytesIO(_file_read(frame_size))
-        self.readline = f.readline
-        data = f.read(n)
-        assert len(data) == n, (len(data), n)
-        return data
+        if self.current_frame:
+            data = self.current_frame.read(n)
+            if not data and n != 0:
+                self.current_frame = None
+                return self.file_read(n)
+            if len(data) < n:
+                raise UnpicklingError(
+                    "pickle exhausted before end of frame")
+            return data
+        else:
+            return self.file_read(n)
 
     def readline(self):
-        if not self.framing_enabled:
+        if self.current_frame:
+            data = self.current_frame.readline()
+            if not data:
+                self.current_frame = None
+                return self.file_readline()
+            if data[-1] != b'\n':
+                raise UnpicklingError(
+                    "pickle exhausted before end of frame")
+            return data
+        else:
             return self.file_readline()
-        else:
-            return self.current_frame.readline()
 
-    def tell(self):
-        if self.file_tell is None:
-            return None
-        elif self.current_frame is None:
-            return self.file_tell()
-        else:
-            return self.frame_start + self.current_frame.tell()
+    def load_frame(self, frame_size):
+        if self.current_frame and self.current_frame.read() != b'':
+            raise UnpicklingError(
+                "beginning of a new frame before end of current frame")
+        self.current_frame = io.BytesIO(self.file_read(frame_size))
 
 
 # Tools used for pickling.
@@ -392,6 +377,8 @@
             self._file_write = file.write
         except AttributeError:
             raise TypeError("file must have a 'write' attribute")
+        self.framer = _Framer(self._file_write)
+        self.write = self.framer.write
         self.memo = {}
         self.proto = int(protocol)
         self.bin = protocol >= 1
@@ -417,18 +404,12 @@
             raise PicklingError("Pickler.__init__() was not called by "
                                 "%s.__init__()" % (self.__class__.__name__,))
         if self.proto >= 2:
-            self._file_write(PROTO + pack("<B", self.proto))
+            self.write(PROTO + pack("<B", self.proto))
         if self.proto >= 4:
-            framer = _Framer(self._file_write)
-            framer.start_framing()
-            self.write = framer.write
-        else:
-            framer = None
-            self.write = self._file_write
+            self.framer.start_framing()
         self.save(obj)
         self.write(STOP)
-        if framer is not None:
-            framer.end_framing()
+        self.framer.end_framing()
 
     def memoize(self, obj):
         """Store an object in the memo."""
@@ -475,6 +456,8 @@
         return GET + repr(i).encode("ascii") + b'\n'
 
     def save(self, obj, save_persistent_id=True):
+        self.framer.commit_frame()
+
         # Check for persistent id (defined by a subclass)
         pid = self.persistent_id(obj)
         if pid is not None and save_persistent_id:
@@ -1078,10 +1061,15 @@
         if not 0 <= proto <= HIGHEST_PROTOCOL:
             raise ValueError("unsupported pickle protocol: %d" % proto)
         self.proto = proto
-        if proto >= 4:
-            self._unframer.framing_enabled = True
     dispatch[PROTO[0]] = load_proto
 
+    def load_frame(self):
+        frame_size, = unpack('<Q', self.read(8))
+        if frame_size > sys.maxsize:
+            raise ValueError("frame size > sys.maxsize: %d" % frame_size)
+        self._unframer.load_frame(frame_size)
+    dispatch[FRAME[0]] = load_frame
+
     def load_persid(self):
         pid = self.readline()[:-1].decode("ascii")
         self.append(self.persistent_load(pid))
diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py
--- a/Lib/test/pickletester.py
+++ b/Lib/test/pickletester.py
@@ -1353,6 +1353,45 @@
                 n_frames = pickled.count(b'\x00\x00\x00\x00\x00')
                 self.assertGreaterEqual(n_frames, len(obj))
 
+    def test_optional_frames(self):
+        if pickle.HIGHEST_PROTOCOL < 4:
+            return
+
+        def remove_frames(pickled, keep_frame=None):
+            """Remove frame opcodes from the given pickle."""
+            frame_starts = []
+            # 1 byte for the opcode and 8 for the argument
+            frame_opcode_size = 9
+            for opcode, _, pos in pickletools.genops(pickled):
+                if opcode.name == 'FRAME':
+                    frame_starts.append(pos)
+
+            newpickle = bytearray()
+            last_frame_end = 0
+            for i, pos in enumerate(frame_starts):
+                if keep_frame and keep_frame(i):
+                    continue
+                newpickle += pickled[last_frame_end:pos]
+                last_frame_end = pos + frame_opcode_size
+            newpickle += pickled[last_frame_end:]
+            return newpickle
+
+        target_frame_size = 64 * 1024
+        num_frames = 20
+        obj = [bytes([i]) * target_frame_size for i in range(num_frames)]
+
+        for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
+            pickled = self.dumps(obj, proto)
+
+            frameless_pickle = remove_frames(pickled)
+            self.assertEqual(count_opcode(pickle.FRAME, frameless_pickle), 0)
+            self.assertEqual(obj, self.loads(frameless_pickle))
+
+            some_frames_pickle = remove_frames(pickled, lambda i: i % 2 == 0)
+            self.assertLess(count_opcode(pickle.FRAME, some_frames_pickle),
+                            count_opcode(pickle.FRAME, pickled))
+            self.assertEqual(obj, self.loads(some_frames_pickle))
+
     def test_nested_names(self):
         global Nested
         class Nested:
diff --git a/Modules/_pickle.c b/Modules/_pickle.c
--- a/Modules/_pickle.c
+++ b/Modules/_pickle.c
@@ -110,10 +110,6 @@
     /* Initial size of the write buffer of Pickler. */
     WRITE_BUF_SIZE = 4096,
 
-    /* Maximum size of the write buffer of Pickler when pickling to a
-       stream.  This is ignored for in-memory pickling. */
-    MAX_WRITE_BUF_SIZE = 64 * 1024,
-
     /* Prefetch size when unpickling (disabled on unpeekable streams) */
     PREFETCH = 8192 * 16,
 
@@ -381,7 +377,6 @@
     char *input_line;
     Py_ssize_t input_len;
     Py_ssize_t next_read_idx;
-    Py_ssize_t frame_end_idx;
     Py_ssize_t prefetched_idx;  /* index of first prefetched byte */
 
     PyObject *read;             /* read() method of the input stream. */
@@ -401,7 +396,6 @@
     int proto;                  /* Protocol of the pickle loaded. */
     int fix_imports;            /* Indicate whether Unpickler should fix
                                    the name of globals pickled by Python 2.x. */
-    int framing;                /* True when framing is enabled, proto >= 4 */
 } UnpicklerObject;
 
 /* Forward declarations */
@@ -802,46 +796,6 @@
         n = data_len;
 
     required = self->output_len + n;
-    if (self->write != NULL && required > MAX_WRITE_BUF_SIZE) {
-        /* XXX This reallocates a new buffer every time, which is a bit
-           wasteful. */
-        if (_Pickler_FlushToFile(self) < 0)
-            return -1;
-        if (_Pickler_ClearBuffer(self) < 0)
-            return -1;
-        /* The previous frame was just committed by _Pickler_FlushToFile */
-        need_new_frame = self->framing;
-        if (need_new_frame)
-            n = data_len + FRAME_HEADER_SIZE;
-        else
-            n = data_len;
-        required = self->output_len + n;
-    }
-    if (self->write != NULL && n > MAX_WRITE_BUF_SIZE) {
-        /* For large pickle chunks, we write directly to the output
-           file instead of buffering. Note the buffer is empty at this
-           point (it was flushed above, since required >= n). */
-        PyObject *output, *result;
-        if (need_new_frame) {
-            char frame_header[FRAME_HEADER_SIZE];
-            _Pickler_WriteFrameHeader(self, frame_header, (size_t) data_len);
-            output = PyBytes_FromStringAndSize(frame_header, FRAME_HEADER_SIZE);
-            if (output == NULL)
-                return -1;
-            result = _Pickler_FastCall(self, self->write, output);
-            Py_XDECREF(result);
-            if (result == NULL)
-                return -1;
-        }
-        /* XXX we could spare an intermediate copy and pass
-           a memoryview instead */
-        output = PyBytes_FromStringAndSize(s, data_len);
-        if (output == NULL)
-            return -1;
-        result = _Pickler_FastCall(self, self->write, output);
-        Py_XDECREF(result);
-        return (result == NULL) ? -1 : 0;
-    }
     if (required > self->max_output_len) {
         /* Make place in buffer for the pickle chunk */
         if (self->output_len >= PY_SSIZE_T_MAX / 2 - n) {
@@ -987,7 +941,6 @@
     self->input_buffer = self->buffer.buf;
     self->input_len = self->buffer.len;
     self->next_read_idx = 0;
-    self->frame_end_idx = -1;
     self->prefetched_idx = self->input_len;
     return self->input_len;
 }
@@ -1052,7 +1005,7 @@
         return -1;
 
     /* Prefetch some data without advancing the file pointer, if possible */
-    if (self->peek && !self->framing) {
+    if (self->peek) {
         PyObject *len, *prefetched;
         len = PyLong_FromSsize_t(PREFETCH);
         if (len == NULL) {
@@ -1100,7 +1053,7 @@
    Returns -1 (with an exception set) on failure. On success, return the
    number of chars read. */
 static Py_ssize_t
-_Unpickler_ReadUnframed(UnpicklerObject *self, char **s, Py_ssize_t n)
+_Unpickler_Read(UnpicklerObject *self, char **s, Py_ssize_t n)
 {
     Py_ssize_t num_read;
 
@@ -1126,67 +1079,6 @@
 }
 
 static Py_ssize_t
-_Unpickler_Read(UnpicklerObject *self, char **s, Py_ssize_t n)
-{
-    if (self->framing &&
-        (self->frame_end_idx == -1 ||
-         self->frame_end_idx <= self->next_read_idx)) {
-        /* Need to read new frame */
-        char *dummy = NULL;
-        unsigned char *frame_start;
-        size_t frame_len;
-        if (_Unpickler_ReadUnframed(self, &dummy, FRAME_HEADER_SIZE) < 0)
-            return -1;
-        frame_start = (unsigned char *) dummy;
-        if (frame_start[0] != (unsigned char)FRAME) {
-            PyErr_Format(UnpicklingError,
-                         "expected FRAME opcode, got 0x%x instead",
-                         frame_start[0]);
-            return -1;
-        }
-        frame_len =  (size_t) frame_start[1];
-        frame_len |= (size_t) frame_start[2] << 8;
-        frame_len |= (size_t) frame_start[3] << 16;
-        frame_len |= (size_t) frame_start[4] << 24;
-#if SIZEOF_SIZE_T >= 8
-        frame_len |= (size_t) frame_start[5] << 32;
-        frame_len |= (size_t) frame_start[6] << 40;
-        frame_len |= (size_t) frame_start[7] << 48;
-        frame_len |= (size_t) frame_start[8] << 56;
-#else
-        if (frame_start[5] || frame_start[6] ||
-            frame_start[7] || frame_start[8]) {
-            PyErr_Format(PyExc_OverflowError,
-                         "Frame size too large for 32-bit build");
-            return -1;
-        }
-#endif
-        if (frame_len > PY_SSIZE_T_MAX) {
-            PyErr_Format(UnpicklingError, "Invalid frame length");
-            return -1;
-        }
-        if ((Py_ssize_t) frame_len < n) {
-            PyErr_Format(UnpicklingError, "Bad framing");
-            return -1;
-        }
-        if (_Unpickler_ReadUnframed(self, &dummy /* unused */,
-                                    frame_len) < 0)
-            return -1;
-        /* Rewind to start of frame */
-        self->frame_end_idx = self->next_read_idx;
-        self->next_read_idx -= frame_len;
-    }
-    if (self->framing) {
-        /* Check for bad input */
-        if (n + self->next_read_idx > self->frame_end_idx) {
-            PyErr_Format(UnpicklingError, "Bad framing");
-            return -1;
-        }
-    }
-    return _Unpickler_ReadUnframed(self, s, n);
-}
-
-static Py_ssize_t
 _Unpickler_CopyLine(UnpicklerObject *self, char *line, Py_ssize_t len,
                     char **result)
 {
@@ -1336,7 +1228,6 @@
     self->input_line = NULL;
     self->input_len = 0;
     self->next_read_idx = 0;
-    self->frame_end_idx = -1;
     self->prefetched_idx = 0;
     self->read = NULL;
     self->readline = NULL;
@@ -1347,7 +1238,6 @@
     self->num_marks = 0;
     self->marks_size = 0;
     self->proto = 0;
-    self->framing = 0;
     self->fix_imports = 0;
     memset(&self->buffer, 0, sizeof(Py_buffer));
     self->memo_size = 32;
@@ -1474,8 +1364,6 @@
 
     if (self->fast)
         return 0;
-    if (_Pickler_OpcodeBoundary(self))
-        return -1;
 
     idx = PyMemoTable_Size(self->memo);
     if (PyMemoTable_Set(self->memo, obj, idx) < 0)
@@ -3661,6 +3549,9 @@
     PyObject *reduce_value = NULL;
     int status = 0;
 
+    if (_Pickler_OpcodeBoundary(self) < 0)
+        return -1;
+
     if (Py_EnterRecursiveCall(" while pickling an object"))
         return -1;
 
@@ -3855,8 +3746,7 @@
         status = -1;
     }
   done:
-    if (status == 0)
-        status = _Pickler_OpcodeBoundary(self);
+
     Py_LeaveRecursiveCall();
     Py_XDECREF(reduce_func);
     Py_XDECREF(reduce_value);
@@ -4514,7 +4404,7 @@
     int i;
     size_t x = 0;
 
-    for (i = 0; i < nbytes; i++) {
+    for (i = 0; i < nbytes && i < sizeof(size_t); i++) {
         x |= (size_t) s[i] << (8 * i);
     }
 
@@ -5972,7 +5862,6 @@
     i = (unsigned char)s[0];
     if (i <= HIGHEST_PROTOCOL) {
         self->proto = i;
-        self->framing = (self->proto >= 4);
         return 0;
     }
 
@@ -5980,16 +5869,39 @@
     return -1;
 }
 
+static int
+load_frame(UnpicklerObject *self)
+{
+    char *s;
+    Py_ssize_t frame_len;
+
+    if (_Unpickler_Read(self, &s, 8) < 0)
+        return -1;
+
+    frame_len = calc_binsize(s, 8);
+    if (frame_len < 0) {
+        PyErr_Format(PyExc_OverflowError,
+                     "FRAME length exceeds system's maximum of %zd bytes",
+                     PY_SSIZE_T_MAX);
+        return -1;
+    }
+
+    if (_Unpickler_Read(self, &s, frame_len) < 0)
+        return -1;
+
+    /* Rewind to start of frame */
+    self->next_read_idx -= frame_len;
+    return 0;
+}
+
 static PyObject *
 load(UnpicklerObject *self)
 {
-    PyObject *err;
     PyObject *value = NULL;
     char *s;
 
     self->num_marks = 0;
     self->proto = 0;
-    self->framing = 0;
     if (Py_SIZE(self->stack))
         Pdata_clear(self->stack, 0);
 
@@ -6063,6 +5975,7 @@
         OP(BINPERSID, load_binpersid)
         OP(REDUCE, load_reduce)
         OP(PROTO, load_proto)
+        OP(FRAME, load_frame)
         OP_ARG(EXT1, load_extension, 1)
         OP_ARG(EXT2, load_extension, 2)
         OP_ARG(EXT4, load_extension, 4)
@@ -6084,11 +5997,7 @@
         break;                  /* and we are done! */
     }
 
-    /* XXX: It is not clear what this is actually for. */
-    if ((err = PyErr_Occurred())) {
-        if (err == PyExc_EOFError) {
-            PyErr_SetNone(PyExc_EOFError);
-        }
+    if (PyErr_Occurred()) {
         return NULL;
     }
 
@@ -6383,7 +6292,6 @@
 
     self->arg = NULL;
     self->proto = 0;
-    self->framing = 0;
 
     return 0;
 }

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list