[Python-checkins] bpo-37596: Make `set` and `frozenset` marshalling deterministic (GH-27926)

ambv webhook-mailer at python.org
Wed Aug 25 07:14:39 EDT 2021


https://github.com/python/cpython/commit/33d95c6facdfda3c8c0feffa7a99184e4abc2f63
commit: 33d95c6facdfda3c8c0feffa7a99184e4abc2f63
branch: main
author: Brandt Bucher <brandt at python.org>
committer: ambv <lukasz at langa.pl>
date: 2021-08-25T13:14:34+02:00
summary:

bpo-37596: Make `set` and `frozenset` marshalling deterministic (GH-27926)

files:
A Misc/NEWS.d/next/Library/2021-08-23-21-39-59.bpo-37596.ojRcwB.rst
M Lib/test/test_marshal.py
M Python/marshal.c

diff --git a/Lib/test/test_marshal.py b/Lib/test/test_marshal.py
index d20b9d2c1ff39..bdfe79fbbecb3 100644
--- a/Lib/test/test_marshal.py
+++ b/Lib/test/test_marshal.py
@@ -344,6 +344,31 @@ def test_eof(self):
         for i in range(len(data)):
             self.assertRaises(EOFError, marshal.loads, data[0: i])
 
+    def test_deterministic_sets(self):
+        # bpo-37596: To support reproducible builds, sets and frozensets need to
+        # have their elements serialized in a consistent order (even when they
+        # have been scrambled by hash randomization):
+        for kind in ("set", "frozenset"):
+            for elements in (
+                "float('nan'), b'a', b'b', b'c', 'x', 'y', 'z'",
+                # Also test for bad interactions with backreferencing:
+                "('string', 1), ('string', 2), ('string', 3)",
+            ):
+                s = f"{kind}([{elements}])"
+                with self.subTest(s):
+                    # First, make sure that our test case still has different
+                    # orders under hash seeds 0 and 1. If this check fails, we
+                    # need to update this test with different elements:
+                    args = ["-c", f"print({s})"]
+                    _, repr_0, _ = assert_python_ok(*args, PYTHONHASHSEED="0")
+                    _, repr_1, _ = assert_python_ok(*args, PYTHONHASHSEED="1")
+                    self.assertNotEqual(repr_0, repr_1)
+                    # Then, perform the actual test:
+                    args = ["-c", f"import marshal; print(marshal.dumps({s}))"]
+                    _, dump_0, _ = assert_python_ok(*args, PYTHONHASHSEED="0")
+                    _, dump_1, _ = assert_python_ok(*args, PYTHONHASHSEED="1")
+                    self.assertEqual(dump_0, dump_1)
+
 LARGE_SIZE = 2**31
 pointer_size = 8 if sys.maxsize > 0xFFFFFFFF else 4
 
diff --git a/Misc/NEWS.d/next/Library/2021-08-23-21-39-59.bpo-37596.ojRcwB.rst b/Misc/NEWS.d/next/Library/2021-08-23-21-39-59.bpo-37596.ojRcwB.rst
new file mode 100644
index 0000000000000..81fdfeb629456
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2021-08-23-21-39-59.bpo-37596.ojRcwB.rst
@@ -0,0 +1,2 @@
+Ensure that :class:`set` and :class:`frozenset` objects are always
+:mod:`marshalled <marshal>` reproducibly.
diff --git a/Python/marshal.c b/Python/marshal.c
index 1260704c74c0b..b69c4d09641da 100644
--- a/Python/marshal.c
+++ b/Python/marshal.c
@@ -503,9 +503,41 @@ w_complex_object(PyObject *v, char flag, WFILE *p)
             W_TYPE(TYPE_SET, p);
         n = PySet_GET_SIZE(v);
         W_SIZE(n, p);
+        // bpo-37596: To support reproducible builds, sets and frozensets need
+        // to have their elements serialized in a consistent order (even when
+        // they have been scrambled by hash randomization). To ensure this, we
+        // use an order equivalent to sorted(v, key=marshal.dumps):
+        PyObject *pairs = PyList_New(0);
+        if (pairs == NULL) {
+            p->error = WFERR_NOMEMORY;
+            return;
+        }
         while (_PySet_NextEntry(v, &pos, &value, &hash)) {
+            PyObject *dump = PyMarshal_WriteObjectToString(value, p->version);
+            if (dump == NULL) {
+                p->error = WFERR_UNMARSHALLABLE;
+                goto anyset_done;
+            }
+            PyObject *pair = PyTuple_Pack(2, dump, value);
+            Py_DECREF(dump);
+            if (pair == NULL || PyList_Append(pairs, pair)) {
+                p->error = WFERR_NOMEMORY;
+                Py_XDECREF(pair);
+                goto anyset_done;
+            }
+            Py_DECREF(pair);
+        }
+        if (PyList_Sort(pairs)) {
+            p->error = WFERR_NOMEMORY;
+            goto anyset_done;
+        }
+        for (Py_ssize_t i = 0; i < n; i++) {
+            PyObject *pair = PyList_GET_ITEM(pairs, i);
+            value = PyTuple_GET_ITEM(pair, 1);
             w_object(value, p);
         }
+    anyset_done:
+        Py_DECREF(pairs);
     }
     else if (PyCode_Check(v)) {
         PyCodeObject *co = (PyCodeObject *)v;



More information about the Python-checkins mailing list