[pypy-commit] pypy default: Implement Siphash-2-4, the same hashing function as CPython 3.x.
arigo
pypy.commits at gmail.com
Wed Jan 25 14:51:46 EST 2017
Author: Armin Rigo <arigo at tunes.org>
Branch:
Changeset: r89771:134351c66421
Date: 2017-01-25 20:51 +0100
http://bitbucket.org/pypy/pypy/changeset/134351c66421/
Log: Implement Siphash-2-4, the same hashing function as CPython 3.x.
Disabled by default.
diff --git a/rpython/rlib/objectmodel.py b/rpython/rlib/objectmodel.py
--- a/rpython/rlib/objectmodel.py
+++ b/rpython/rlib/objectmodel.py
@@ -520,10 +520,22 @@
# ----------
HASH_ALGORITHM = "rpython" # XXX Is there a better name?
+HASH_ALGORITHM_FIXED = False
-def _hash_string(s):
- """The algorithm behind compute_hash() for a string or a unicode."""
+@not_rpython
+def set_hash_algorithm(algo):
+ """Must be called very early, before any string is hashed with
+ compute_hash()!"""
+ global HASH_ALGORITHM
+ if HASH_ALGORITHM != algo:
+ assert not HASH_ALGORITHM_FIXED, "compute_hash() already called!"
+ assert algo in ("rpython", "siphash24")
+ HASH_ALGORITHM = algo
+
+
+def _hash_string_rpython(s):
from rpython.rlib.rarithmetic import intmask
+
length = len(s)
if length == 0:
return -1
@@ -535,6 +547,83 @@
x ^= length
return intmask(x)
+
+@not_rpython
+def _hash_string_siphash24(s):
+ """This version is called when untranslated only."""
+ import array
+ from rpython.rlib.rsiphash import siphash24
+ from rpython.rtyper.lltypesystem import lltype, rffi
+ from rpython.rlib.rarithmetic import intmask
+
+ if isinstance(s, str):
+ pass
+ elif isinstance(s, unicode):
+ if rffi.sizeof(lltype.UniChar) == 4:
+ kind = "I"
+ else:
+ kind = "H"
+ s = array.array(kind, map(ord, s)).tostring()
+ else:
+ if lltype.typeOf(s).TO.chars.OF == lltype.Char:
+ kind = "B"
+ elif rffi.sizeof(lltype.UniChar) == 4:
+ kind = "I"
+ else:
+ kind = "H"
+ s = array.array(kind, map(ord, s.chars)).tostring()
+ ptr = rffi.str2charp(s)
+ x = siphash24(ptr, len(s))
+ rffi.free_charp(ptr)
+ return intmask(x)
+
+def ll_hash_string_siphash24(ll_s):
+ """Called from lltypesystem/rstr.py. 'll_s' is a rstr.STR or UNICODE."""
+ from rpython.rlib.rsiphash import siphash24
+ from rpython.rtyper.lltypesystem import lltype, rffi, rstr
+ from rpython.rlib.rarithmetic import intmask
+
+ length = len(ll_s.chars)
+ # no GC operation from here!
+ if lltype.typeOf(ll_s).TO.chars.OF == lltype.Char:
+ addr = rstr._get_raw_buf_string(rstr.STR, ll_s, 0)
+ else:
+ addr = rstr._get_raw_buf_unicode(rstr.UNICODE, ll_s, 0)
+ length *= rffi.sizeof(rstr.UNICODE.chars.OF)
+ x = siphash24(addr, length)
+ keepalive_until_here(ll_s)
+ return intmask(x)
+ll_hash_string_siphash24._jit_look_inside_ = False
+
+
+@not_rpython
+def _hash_string(s):
+ """The algorithm behind compute_hash() for a string or a unicode.
+ This version is only for untranslated usage, and 's' is a str or unicode.
+ """
+ global HASH_ALGORITHM_FIXED
+ HASH_ALGORITHM_FIXED = True
+ if HASH_ALGORITHM == "rpython":
+ return _hash_string_rpython(s)
+ if HASH_ALGORITHM == "siphash24":
+ return _hash_string_siphash24(s)
+ raise NotImplementedError
+
+def ll_hash_string(ll_s):
+ """The algorithm behind compute_hash() for a string or a unicode.
+ This version is called from lltypesystem/rstr.py, and 'll_s' is a
+ rstr.STR or rstr.UNICODE.
+ """
+ if HASH_ALGORITHM == "rpython":
+ return _hash_string_rpython(ll_s.chars)
+ if HASH_ALGORITHM == "siphash24":
+ if we_are_translated():
+ return ll_hash_string_siphash24(ll_s)
+ else:
+ return _hash_string_siphash24(ll_s)
+ raise NotImplementedError
+
+
def _hash_float(f):
"""The algorithm behind compute_hash() for a float.
This implementation is identical to the CPython implementation,
diff --git a/rpython/rlib/rsiphash.py b/rpython/rlib/rsiphash.py
--- a/rpython/rlib/rsiphash.py
+++ b/rpython/rlib/rsiphash.py
@@ -2,6 +2,7 @@
from contextlib import contextmanager
from rpython.rlib import rarithmetic
from rpython.rlib.objectmodel import not_rpython, always_inline
+from rpython.rlib.rgc import no_collect
from rpython.rlib.rarithmetic import r_uint64
from rpython.rlib.rawstorage import misaligned_is_fine
from rpython.rtyper.lltypesystem import lltype, llmemory, rffi
@@ -73,11 +74,11 @@
return v0, v1, v2, v3
-def siphash24(ptr, size):
- """Takes a CCHARP pointer and a size. Returns the hash as a r_uint64,
+@no_collect
+def siphash24(addr_in, size):
+ """Takes an address pointer and a size. Returns the hash as a r_uint64,
which can then be casted to the expected type."""
- addr_in = llmemory.cast_ptr_to_adr(ptr)
direct = (misaligned_is_fine or
(rffi.cast(lltype.Signed, addr_in) & 7) == 0)
diff --git a/rpython/rlib/test/test_rsiphash.py b/rpython/rlib/test/test_rsiphash.py
--- a/rpython/rlib/test/test_rsiphash.py
+++ b/rpython/rlib/test/test_rsiphash.py
@@ -1,5 +1,5 @@
from rpython.rlib.rsiphash import siphash24, choosen_seed
-from rpython.rtyper.lltypesystem import rffi
+from rpython.rtyper.lltypesystem import llmemory, rffi
CASES = [
@@ -32,8 +32,8 @@
q = rffi.str2charp('?' + s)
with choosen_seed(0x8a9f065a358479f4, 0x11cb1e9ee7f40e1f,
test_misaligned_path=True):
- x = siphash24(p, len(s))
- y = siphash24(rffi.ptradd(q, 1), len(s))
+ x = siphash24(llmemory.cast_ptr_to_adr(p), len(s))
+ y = siphash24(llmemory.cast_ptr_to_adr(rffi.ptradd(q, 1)), len(s))
rffi.free_charp(p)
rffi.free_charp(q)
assert x == y
diff --git a/rpython/rtyper/lltypesystem/rbytearray.py b/rpython/rtyper/lltypesystem/rbytearray.py
--- a/rpython/rtyper/lltypesystem/rbytearray.py
+++ b/rpython/rtyper/lltypesystem/rbytearray.py
@@ -8,10 +8,10 @@
def mallocbytearray(size):
return lltype.malloc(BYTEARRAY, size)
-_, _, copy_bytearray_contents = rstr._new_copy_contents_fun(BYTEARRAY, BYTEARRAY,
+_, _, copy_bytearray_contents, _ = rstr._new_copy_contents_fun(BYTEARRAY, BYTEARRAY,
lltype.Char,
'bytearray')
-_, _, copy_bytearray_contents_from_str = rstr._new_copy_contents_fun(rstr.STR,
+_, _, copy_bytearray_contents_from_str, _ = rstr._new_copy_contents_fun(rstr.STR,
BYTEARRAY,
lltype.Char,
'bytearray_from_str')
diff --git a/rpython/rtyper/lltypesystem/rstr.py b/rpython/rtyper/lltypesystem/rstr.py
--- a/rpython/rtyper/lltypesystem/rstr.py
+++ b/rpython/rtyper/lltypesystem/rstr.py
@@ -3,7 +3,7 @@
from rpython.annotator import model as annmodel
from rpython.rlib import jit, types
from rpython.rlib.objectmodel import (malloc_zero_filled, we_are_translated,
- _hash_string, keepalive_until_here, specialize, enforceargs)
+ ll_hash_string, keepalive_until_here, specialize, enforceargs)
from rpython.rlib.signature import signature
from rpython.rlib.rarithmetic import ovfcheck
from rpython.rtyper.error import TyperError
@@ -136,15 +136,19 @@
copy_raw_to_string = func_with_new_name(copy_raw_to_string,
'copy_raw_to_%s' % name)
- return copy_string_to_raw, copy_raw_to_string, copy_string_contents
+ return (copy_string_to_raw, copy_raw_to_string, copy_string_contents,
+ _get_raw_buf)
(copy_string_to_raw,
copy_raw_to_string,
- copy_string_contents) = _new_copy_contents_fun(STR, STR, Char, 'string')
+ copy_string_contents,
+ _get_raw_buf_string) = _new_copy_contents_fun(STR, STR, Char, 'string')
(copy_unicode_to_raw,
copy_raw_to_unicode,
- copy_unicode_contents) = _new_copy_contents_fun(UNICODE, UNICODE, UniChar, 'unicode')
+ copy_unicode_contents,
+ _get_raw_buf_unicode) = _new_copy_contents_fun(UNICODE, UNICODE, UniChar,
+ 'unicode')
CONST_STR_CACHE = WeakValueDictionary()
CONST_UNICODE_CACHE = WeakValueDictionary()
@@ -382,7 +386,7 @@
# but our malloc initializes the memory to zero, so we use zero as the
# special non-computed-yet value. Also, jit.conditional_call_elidable
# always checks for zero, for now.
- x = _hash_string(s.chars)
+ x = ll_hash_string(s)
if x == 0:
x = 29872897
s.hash = x
diff --git a/rpython/translator/c/test/test_typed.py b/rpython/translator/c/test/test_typed.py
--- a/rpython/translator/c/test/test_typed.py
+++ b/rpython/translator/c/test/test_typed.py
@@ -1,8 +1,12 @@
from __future__ import with_statement
import math
-import sys
+import sys, os
+if __name__ == '__main__':
+ # hack for test_hash_string_siphash24()
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__),
+ '..', '..', '..', '..'))
import py
from rpython.rlib.rstackovf import StackOverflow
@@ -597,6 +601,40 @@
assert res[3] == compute_hash(d)
assert res[4] == compute_hash(("Hi", None, (7.5, 2, d)))
+ def _test_hash_string(self, algo):
+ from rpython.rlib import objectmodel
+ objectmodel.set_hash_algorithm(algo)
+ s = "hello"
+ u = u"world"
+ hash_s = compute_hash(s)
+ hash_u = compute_hash(u)
+ #
+ def fn(length):
+ assert length >= 1
+ return str((compute_hash(s),
+ compute_hash(u),
+ compute_hash(s[0] + s[1:length]),
+ compute_hash(u[0] + u[1:length])))
+
+ assert fn(5) == str((hash_s, hash_u, hash_s, hash_u))
+
+ f = self.getcompiled(fn, [int])
+ res = f(5)
+ res = [int(a) for a in res[1:-1].split(",")]
+ assert res[0] == hash_s
+ assert res[1] == hash_u
+ assert res[2] == hash_s
+ assert res[3] == hash_u
+
+ def test_hash_string_rpython(self):
+ self._test_hash_string("rpython")
+
+ def test_hash_string_siphash24(self):
+ import subprocess
+ subprocess.check_call([sys.executable, __file__, "siphash24",
+ self.__class__.__module__,
+ self.__class__.__name__])
+
def test_list_basic_ops(self):
def list_basic_ops(i, j):
l = [1, 2, 3]
@@ -896,3 +934,11 @@
f = self.getcompiled(func, [int])
res = f(2)
assert res == 1 # and not 2
+
+
+if __name__ == '__main__':
+ # for test_hash_string_siphash24()
+ algo, clsmodule, clsname = sys.argv[1:]
+ mod = __import__(clsmodule, None, None, [clsname])
+ cls = getattr(mod, clsname)
+ cls()._test_hash_string(algo)
More information about the pypy-commit mailing list