[pypy-commit] pypy default: Implement Siphash-2-4, the same hashing function as CPython 3.x.

arigo pypy.commits at gmail.com
Wed Jan 25 14:51:46 EST 2017


Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r89771:134351c66421
Date: 2017-01-25 20:51 +0100
http://bitbucket.org/pypy/pypy/changeset/134351c66421/

Log:	Implement Siphash-2-4, the same hashing function as CPython 3.x.
	Disabled by default.

diff --git a/rpython/rlib/objectmodel.py b/rpython/rlib/objectmodel.py
--- a/rpython/rlib/objectmodel.py
+++ b/rpython/rlib/objectmodel.py
@@ -520,10 +520,22 @@
 # ----------
 
 HASH_ALGORITHM = "rpython"  # XXX Is there a better name?
+HASH_ALGORITHM_FIXED = False
 
-def _hash_string(s):
-    """The algorithm behind compute_hash() for a string or a unicode."""
+ at not_rpython
+def set_hash_algorithm(algo):
+    """Must be called very early, before any string is hashed with
+    compute_hash()!"""
+    global HASH_ALGORITHM
+    if HASH_ALGORITHM != algo:
+        assert not HASH_ALGORITHM_FIXED, "compute_hash() already called!"
+        assert algo in ("rpython", "siphash24")
+        HASH_ALGORITHM = algo
+
+
+def _hash_string_rpython(s):
     from rpython.rlib.rarithmetic import intmask
+
     length = len(s)
     if length == 0:
         return -1
@@ -535,6 +547,83 @@
     x ^= length
     return intmask(x)
 
+
+ at not_rpython
+def _hash_string_siphash24(s):
+    """This version is called when untranslated only."""
+    import array
+    from rpython.rlib.rsiphash import siphash24
+    from rpython.rtyper.lltypesystem import lltype, rffi
+    from rpython.rlib.rarithmetic import intmask
+
+    if isinstance(s, str):
+        pass
+    elif isinstance(s, unicode):
+        if rffi.sizeof(lltype.UniChar) == 4:
+            kind = "I"
+        else:
+            kind = "H"
+        s = array.array(kind, map(ord, s)).tostring()
+    else:
+        if lltype.typeOf(s).TO.chars.OF == lltype.Char:
+            kind = "B"
+        elif rffi.sizeof(lltype.UniChar) == 4:
+            kind = "I"
+        else:
+            kind = "H"
+        s = array.array(kind, map(ord, s.chars)).tostring()
+    ptr = rffi.str2charp(s)
+    x = siphash24(ptr, len(s))
+    rffi.free_charp(ptr)
+    return intmask(x)
+
+def ll_hash_string_siphash24(ll_s):
+    """Called from lltypesystem/rstr.py.  'll_s' is a rstr.STR or UNICODE."""
+    from rpython.rlib.rsiphash import siphash24
+    from rpython.rtyper.lltypesystem import lltype, rffi, rstr
+    from rpython.rlib.rarithmetic import intmask
+
+    length = len(ll_s.chars)
+    # no GC operation from here!
+    if lltype.typeOf(ll_s).TO.chars.OF == lltype.Char:
+        addr = rstr._get_raw_buf_string(rstr.STR, ll_s, 0)
+    else:
+        addr = rstr._get_raw_buf_unicode(rstr.UNICODE, ll_s, 0)
+        length *= rffi.sizeof(rstr.UNICODE.chars.OF)
+    x = siphash24(addr, length)
+    keepalive_until_here(ll_s)
+    return intmask(x)
+ll_hash_string_siphash24._jit_look_inside_ = False
+
+
+ at not_rpython
+def _hash_string(s):
+    """The algorithm behind compute_hash() for a string or a unicode.
+    This version is only for untranslated usage, and 's' is a str or unicode.
+    """
+    global HASH_ALGORITHM_FIXED
+    HASH_ALGORITHM_FIXED = True
+    if HASH_ALGORITHM == "rpython":
+        return _hash_string_rpython(s)
+    if HASH_ALGORITHM == "siphash24":
+        return _hash_string_siphash24(s)
+    raise NotImplementedError
+
+def ll_hash_string(ll_s):
+    """The algorithm behind compute_hash() for a string or a unicode.
+    This version is called from lltypesystem/rstr.py, and 'll_s' is a
+    rstr.STR or rstr.UNICODE.
+    """
+    if HASH_ALGORITHM == "rpython":
+        return _hash_string_rpython(ll_s.chars)
+    if HASH_ALGORITHM == "siphash24":
+        if we_are_translated():
+            return ll_hash_string_siphash24(ll_s)
+        else:
+            return _hash_string_siphash24(ll_s)
+    raise NotImplementedError
+
+
 def _hash_float(f):
     """The algorithm behind compute_hash() for a float.
     This implementation is identical to the CPython implementation,
diff --git a/rpython/rlib/rsiphash.py b/rpython/rlib/rsiphash.py
--- a/rpython/rlib/rsiphash.py
+++ b/rpython/rlib/rsiphash.py
@@ -2,6 +2,7 @@
 from contextlib import contextmanager
 from rpython.rlib import rarithmetic
 from rpython.rlib.objectmodel import not_rpython, always_inline
+from rpython.rlib.rgc import no_collect
 from rpython.rlib.rarithmetic import r_uint64
 from rpython.rlib.rawstorage import misaligned_is_fine
 from rpython.rtyper.lltypesystem import lltype, llmemory, rffi
@@ -73,11 +74,11 @@
     return v0, v1, v2, v3
 
 
-def siphash24(ptr, size):
-    """Takes a CCHARP pointer and a size.  Returns the hash as a r_uint64,
+ at no_collect
+def siphash24(addr_in, size):
+    """Takes an address pointer and a size.  Returns the hash as a r_uint64,
     which can then be casted to the expected type."""
 
-    addr_in = llmemory.cast_ptr_to_adr(ptr)
     direct = (misaligned_is_fine or
                  (rffi.cast(lltype.Signed, addr_in) & 7) == 0)
 
diff --git a/rpython/rlib/test/test_rsiphash.py b/rpython/rlib/test/test_rsiphash.py
--- a/rpython/rlib/test/test_rsiphash.py
+++ b/rpython/rlib/test/test_rsiphash.py
@@ -1,5 +1,5 @@
 from rpython.rlib.rsiphash import siphash24, choosen_seed
-from rpython.rtyper.lltypesystem import rffi
+from rpython.rtyper.lltypesystem import llmemory, rffi
 
 
 CASES = [
@@ -32,8 +32,8 @@
     q = rffi.str2charp('?' + s)
     with choosen_seed(0x8a9f065a358479f4, 0x11cb1e9ee7f40e1f,
                       test_misaligned_path=True):
-        x = siphash24(p, len(s))
-        y = siphash24(rffi.ptradd(q, 1), len(s))
+        x = siphash24(llmemory.cast_ptr_to_adr(p), len(s))
+        y = siphash24(llmemory.cast_ptr_to_adr(rffi.ptradd(q, 1)), len(s))
     rffi.free_charp(p)
     rffi.free_charp(q)
     assert x == y
diff --git a/rpython/rtyper/lltypesystem/rbytearray.py b/rpython/rtyper/lltypesystem/rbytearray.py
--- a/rpython/rtyper/lltypesystem/rbytearray.py
+++ b/rpython/rtyper/lltypesystem/rbytearray.py
@@ -8,10 +8,10 @@
 def mallocbytearray(size):
     return lltype.malloc(BYTEARRAY, size)
 
-_, _, copy_bytearray_contents = rstr._new_copy_contents_fun(BYTEARRAY, BYTEARRAY,
+_, _, copy_bytearray_contents, _ = rstr._new_copy_contents_fun(BYTEARRAY, BYTEARRAY,
                                                          lltype.Char,
                                                          'bytearray')
-_, _, copy_bytearray_contents_from_str = rstr._new_copy_contents_fun(rstr.STR,
+_, _, copy_bytearray_contents_from_str, _ = rstr._new_copy_contents_fun(rstr.STR,
                                                                   BYTEARRAY,
                                                                   lltype.Char,
                                                                   'bytearray_from_str')
diff --git a/rpython/rtyper/lltypesystem/rstr.py b/rpython/rtyper/lltypesystem/rstr.py
--- a/rpython/rtyper/lltypesystem/rstr.py
+++ b/rpython/rtyper/lltypesystem/rstr.py
@@ -3,7 +3,7 @@
 from rpython.annotator import model as annmodel
 from rpython.rlib import jit, types
 from rpython.rlib.objectmodel import (malloc_zero_filled, we_are_translated,
-    _hash_string, keepalive_until_here, specialize, enforceargs)
+    ll_hash_string, keepalive_until_here, specialize, enforceargs)
 from rpython.rlib.signature import signature
 from rpython.rlib.rarithmetic import ovfcheck
 from rpython.rtyper.error import TyperError
@@ -136,15 +136,19 @@
     copy_raw_to_string = func_with_new_name(copy_raw_to_string,
                                               'copy_raw_to_%s' % name)
 
-    return copy_string_to_raw, copy_raw_to_string, copy_string_contents
+    return (copy_string_to_raw, copy_raw_to_string, copy_string_contents,
+            _get_raw_buf)
 
 (copy_string_to_raw,
  copy_raw_to_string,
- copy_string_contents) = _new_copy_contents_fun(STR, STR, Char, 'string')
+ copy_string_contents,
+ _get_raw_buf_string) = _new_copy_contents_fun(STR, STR, Char, 'string')
 
 (copy_unicode_to_raw,
  copy_raw_to_unicode,
- copy_unicode_contents) = _new_copy_contents_fun(UNICODE, UNICODE, UniChar, 'unicode')
+ copy_unicode_contents,
+ _get_raw_buf_unicode) = _new_copy_contents_fun(UNICODE, UNICODE, UniChar,
+                                                'unicode')
 
 CONST_STR_CACHE = WeakValueDictionary()
 CONST_UNICODE_CACHE = WeakValueDictionary()
@@ -382,7 +386,7 @@
         # but our malloc initializes the memory to zero, so we use zero as the
         # special non-computed-yet value.  Also, jit.conditional_call_elidable
         # always checks for zero, for now.
-        x = _hash_string(s.chars)
+        x = ll_hash_string(s)
         if x == 0:
             x = 29872897
         s.hash = x
diff --git a/rpython/translator/c/test/test_typed.py b/rpython/translator/c/test/test_typed.py
--- a/rpython/translator/c/test/test_typed.py
+++ b/rpython/translator/c/test/test_typed.py
@@ -1,8 +1,12 @@
 from __future__ import with_statement
 
 import math
-import sys
+import sys, os
 
+if __name__ == '__main__':
+    # hack for test_hash_string_siphash24()
+    sys.path.insert(0, os.path.join(os.path.dirname(__file__),
+                                    '..', '..', '..', '..'))
 import py
 
 from rpython.rlib.rstackovf import StackOverflow
@@ -597,6 +601,40 @@
         assert res[3] == compute_hash(d)
         assert res[4] == compute_hash(("Hi", None, (7.5, 2, d)))
 
+    def _test_hash_string(self, algo):
+        from rpython.rlib import objectmodel
+        objectmodel.set_hash_algorithm(algo)
+        s = "hello"
+        u = u"world"
+        hash_s = compute_hash(s)
+        hash_u = compute_hash(u)
+        #
+        def fn(length):
+            assert length >= 1
+            return str((compute_hash(s),
+                        compute_hash(u),
+                        compute_hash(s[0] + s[1:length]),
+                        compute_hash(u[0] + u[1:length])))
+
+        assert fn(5) == str((hash_s, hash_u, hash_s, hash_u))
+
+        f = self.getcompiled(fn, [int])
+        res = f(5)
+        res = [int(a) for a in res[1:-1].split(",")]
+        assert res[0] == hash_s
+        assert res[1] == hash_u
+        assert res[2] == hash_s
+        assert res[3] == hash_u
+
+    def test_hash_string_rpython(self):
+        self._test_hash_string("rpython")
+
+    def test_hash_string_siphash24(self):
+        import subprocess
+        subprocess.check_call([sys.executable, __file__, "siphash24",
+                               self.__class__.__module__,
+                               self.__class__.__name__])
+
     def test_list_basic_ops(self):
         def list_basic_ops(i, j):
             l = [1, 2, 3]
@@ -896,3 +934,11 @@
         f = self.getcompiled(func, [int])
         res = f(2)
         assert res == 1     # and not 2
+
+
+if __name__ == '__main__':
+    # for test_hash_string_siphash24()
+    algo, clsmodule, clsname = sys.argv[1:]
+    mod = __import__(clsmodule, None, None, [clsname])
+    cls = getattr(mod, clsname)
+    cls()._test_hash_string(algo)


More information about the pypy-commit mailing list