[Python-checkins] cpython: Issue #24762: Speed-up frozenset_hash() and greatly beef-up the comments.
raymond.hettinger
python-checkins at python.org
Sat Aug 1 18:53:08 CEST 2015
https://hg.python.org/cpython/rev/cf707dd190a9
changeset: 97178:cf707dd190a9
user: Raymond Hettinger <python at rcn.com>
date: Sat Aug 01 09:53:00 2015 -0700
summary:
Issue #24762: Speed-up frozenset_hash() and greatly beef-up the comments.
files:
Objects/setobject.c | 70 ++++++++++++++++++++------------
1 files changed, 43 insertions(+), 27 deletions(-)
diff --git a/Objects/setobject.c b/Objects/setobject.c
--- a/Objects/setobject.c
+++ b/Objects/setobject.c
@@ -739,41 +739,57 @@
return 0;
}
+/* Work to increase the bit dispersion for closely spaced hash values.
+ This is important because some use cases have many combinations of a
+ small number of elements with nearby hashes so that many distinct
+ combinations collapse to only a handful of distinct hash values. */
+
+static Py_uhash_t
+_shuffle_bits(Py_uhash_t h)
+{
+ return ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
+}
+
+/* Most of the constants in this hash algorithm are randomly chosen
+ large primes with "interesting bit patterns" and that passed tests
+ for good collision statistics on a variety of problematic datasets
+ including powersets and graph structures (such as David Eppstein's
+ graph recipes in Lib/test/test_set.py) */
+
static Py_hash_t
frozenset_hash(PyObject *self)
{
- /* Most of the constants in this hash algorithm are randomly choosen
- large primes with "interesting bit patterns" and that passed
- tests for good collision statistics on a variety of problematic
- datasets such as:
+ PySetObject *so = (PySetObject *)self;
+ Py_uhash_t hash = 1927868237UL;
+ setentry *entry;
- ps = []
- for r in range(21):
- ps += itertools.combinations(range(20), r)
- num_distinct_hashes = len({hash(frozenset(s)) for s in ps})
+ /* Make hash(frozenset({0})) distinct from hash(frozenset()) */
+ hash *= (Py_uhash_t)PySet_GET_SIZE(self) + 1;
- */
- PySetObject *so = (PySetObject *)self;
- Py_uhash_t h, hash = 1927868237UL;
- setentry *entry;
- Py_ssize_t pos = 0;
+ /* Xor-in shuffled bits from every entry's hash field because xor is
+ commutative and a frozenset hash should be independent of order.
- if (so->hash != -1)
- return so->hash;
+ For speed, include null entries and dummy entries and then
+ subtract out their effect afterwards so that the final hash
+ depends only on active entries. This allows the code to be
+ vectorized by the compiler and it saves the unpredictable
+ branches that would arise when trying to exclude null and dummy
+ entries on every iteration. */
- hash *= (Py_uhash_t)PySet_GET_SIZE(self) + 1;
- while (set_next(so, &pos, &entry)) {
- /* Work to increase the bit dispersion for closely spaced hash
- values. This is important because some use cases have many
- combinations of a small number of elements with nearby
- hashes so that many distinct combinations collapse to only
- a handful of distinct hash values. */
- h = entry->hash;
- hash ^= ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
- }
- /* Make the final result spread-out in a different pattern
- than the algorithm for tuples or other python objects. */
+ for (entry = so->table; entry <= &so->table[so->mask]; entry++)
+ hash ^= _shuffle_bits(entry->hash);
+
+ /* Remove the effect of an odd number of NULL entries */
+ if ((so->mask + 1 - so->fill) & 1)
+ hash ^= _shuffle_bits(0);
+
+ /* Remove the effect of an odd number of dummy entries */
+ if ((so->fill - so->used) & 1)
+ hash ^= _shuffle_bits(-1);
+
+ /* Disperse patterns arising in nested frozensets */
hash = hash * 69069U + 907133923UL;
+
if (hash == (Py_uhash_t)-1)
hash = 590923713UL;
so->hash = hash;
--
Repository URL: https://hg.python.org/cpython
More information about the Python-checkins
mailing list