[Python-checkins] gh-94155: Reduce hash collisions for code objects (#100183)

sweeneyde webhook-mailer at python.org
Fri Dec 23 13:15:53 EST 2022


https://github.com/python/cpython/commit/a98d9ea56e7b473af54438ecc487a6bf1b4d6530
commit: a98d9ea56e7b473af54438ecc487a6bf1b4d6530
branch: main
author: Dennis Sweeney <36520290+sweeneyde at users.noreply.github.com>
committer: sweeneyde <36520290+sweeneyde at users.noreply.github.com>
date: 2022-12-23T13:15:47-05:00
summary:

gh-94155: Reduce hash collisions for code objects (#100183)

* Uses a better hashing algorithm to get better dispersion and remove commutativity.

* Incorporates `co_firstlineno`, `Py_SIZE(co)`, and bytecode instructions.

* This is now the entire set of criteria used in `code_richcompare`, except for `_PyCode_ConstantKey` (which would incorporate the types of `co_consts` rather than just their values).

files:
A Misc/NEWS.d/next/Core and Builtins/2022-12-12-00-59-11.gh-issue-94155.LWE9y_.rst
M Lib/test/test_code.py
M Objects/codeobject.c

diff --git a/Lib/test/test_code.py b/Lib/test/test_code.py
index 02ab8fbcdb07..b13d5770abe8 100644
--- a/Lib/test/test_code.py
+++ b/Lib/test/test_code.py
@@ -465,6 +465,32 @@ def f():
         self.assertNotEqual(code_b, code_d)
         self.assertNotEqual(code_c, code_d)
 
+    def test_code_hash_uses_firstlineno(self):
+        c1 = (lambda: 1).__code__
+        c2 = (lambda: 1).__code__
+        self.assertNotEqual(c1, c2)
+        self.assertNotEqual(hash(c1), hash(c2))
+        c3 = c1.replace(co_firstlineno=17)
+        self.assertNotEqual(c1, c3)
+        self.assertNotEqual(hash(c1), hash(c3))
+
+    def test_code_hash_uses_order(self):
+        # Swapping posonlyargcount and kwonlyargcount should change the hash.
+        c = (lambda x, y, *, z=1, w=1: 1).__code__
+        self.assertEqual(c.co_argcount, 2)
+        self.assertEqual(c.co_posonlyargcount, 0)
+        self.assertEqual(c.co_kwonlyargcount, 2)
+        swapped = c.replace(co_posonlyargcount=2, co_kwonlyargcount=0)
+        self.assertNotEqual(c, swapped)
+        self.assertNotEqual(hash(c), hash(swapped))
+
+    def test_code_hash_uses_bytecode(self):
+        c = (lambda x, y: x + y).__code__
+        d = (lambda x, y: x * y).__code__
+        c1 = c.replace(co_code=d.co_code)
+        self.assertNotEqual(c, c1)
+        self.assertNotEqual(hash(c), hash(c1))
+
 
 def isinterned(s):
     return s is sys.intern(('_' + s + '_')[1:-1])
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-12-12-00-59-11.gh-issue-94155.LWE9y_.rst b/Misc/NEWS.d/next/Core and Builtins/2022-12-12-00-59-11.gh-issue-94155.LWE9y_.rst
new file mode 100644
index 000000000000..e7c7ed2fad0e
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-12-12-00-59-11.gh-issue-94155.LWE9y_.rst	
@@ -0,0 +1 @@
+Improved the hashing algorithm for code objects, mitigating some hash collisions.
diff --git a/Objects/codeobject.c b/Objects/codeobject.c
index 1e5a92270be8..e174c6fee9cc 100644
--- a/Objects/codeobject.c
+++ b/Objects/codeobject.c
@@ -1842,28 +1842,41 @@ code_richcompare(PyObject *self, PyObject *other, int op)
 static Py_hash_t
 code_hash(PyCodeObject *co)
 {
-    Py_hash_t h, h0, h1, h2, h3;
-    h0 = PyObject_Hash(co->co_name);
-    if (h0 == -1) return -1;
-    h1 = PyObject_Hash(co->co_consts);
-    if (h1 == -1) return -1;
-    h2 = PyObject_Hash(co->co_names);
-    if (h2 == -1) return -1;
-    h3 = PyObject_Hash(co->co_localsplusnames);
-    if (h3 == -1) return -1;
-    Py_hash_t h4 = PyObject_Hash(co->co_linetable);
-    if (h4 == -1) {
-        return -1;
+    Py_uhash_t uhash = 20221211;
+    #define SCRAMBLE_IN(H) do {       \
+        uhash ^= (Py_uhash_t)(H);     \
+        uhash *= _PyHASH_MULTIPLIER;  \
+    } while (0)
+    #define SCRAMBLE_IN_HASH(EXPR) do {     \
+        Py_hash_t h = PyObject_Hash(EXPR);  \
+        if (h == -1) {                      \
+            return -1;                      \
+        }                                   \
+        SCRAMBLE_IN(h);                     \
+    } while (0)
+
+    SCRAMBLE_IN_HASH(co->co_name);
+    SCRAMBLE_IN_HASH(co->co_consts);
+    SCRAMBLE_IN_HASH(co->co_names);
+    SCRAMBLE_IN_HASH(co->co_localsplusnames);
+    SCRAMBLE_IN_HASH(co->co_linetable);
+    SCRAMBLE_IN_HASH(co->co_exceptiontable);
+    SCRAMBLE_IN(co->co_argcount);
+    SCRAMBLE_IN(co->co_posonlyargcount);
+    SCRAMBLE_IN(co->co_kwonlyargcount);
+    SCRAMBLE_IN(co->co_flags);
+    SCRAMBLE_IN(co->co_firstlineno);
+    SCRAMBLE_IN(Py_SIZE(co));
+    for (int i = 0; i < Py_SIZE(co); i++) {
+        int deop = _PyOpcode_Deopt[_Py_OPCODE(_PyCode_CODE(co)[i])];
+        SCRAMBLE_IN(deop);
+        SCRAMBLE_IN(_Py_OPARG(_PyCode_CODE(co)[i]));
+        i += _PyOpcode_Caches[deop];
     }
-    Py_hash_t h5 = PyObject_Hash(co->co_exceptiontable);
-    if (h5 == -1) {
-        return -1;
+    if ((Py_hash_t)uhash == -1) {
+        return -2;
     }
-    h = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^
-        co->co_argcount ^ co->co_posonlyargcount ^ co->co_kwonlyargcount ^
-        co->co_flags;
-    if (h == -1) h = -2;
-    return h;
+    return (Py_hash_t)uhash;
 }
 
 



More information about the Python-checkins mailing list