[pypy-commit] pypy rpython-hash: in-progress: generate the random seed at runtime
arigo
pypy.commits at gmail.com
Sun Jan 29 12:51:05 EST 2017
Author: Armin Rigo <arigo at tunes.org>
Branch: rpython-hash
Changeset: r89828:746716a29381
Date: 2017-01-29 18:10 +0100
http://bitbucket.org/pypy/pypy/changeset/746716a29381/
Log: in-progress: generate the random seed at runtime
diff --git a/rpython/rlib/objectmodel.py b/rpython/rlib/objectmodel.py
--- a/rpython/rlib/objectmodel.py
+++ b/rpython/rlib/objectmodel.py
@@ -519,21 +519,11 @@
# ----------
-HASH_ALGORITHM = "rpython" # the default, no source of randomness possible
-HASH_ALGORITHM_FIXED = False
-
-@not_rpython
-def set_hash_algorithm(algo):
- """Must be called very early, before any string is hashed with
- compute_hash()!"""
- global HASH_ALGORITHM
- if HASH_ALGORITHM != algo:
- assert not HASH_ALGORITHM_FIXED, "compute_hash() already called!"
- assert algo in ("rpython", "siphash24")
- HASH_ALGORITHM = algo
-
-
-def _hash_string_rpython(s):
+def _hash_string(s):
+ """The default algorithm behind compute_hash() for a string or a unicode.
+ There is a mechanism to use another one in programs after translation.
+ See rsiphash.py, which implements the algorithm of CPython >= 3.4.
+ """
from rpython.rlib.rarithmetic import intmask
length = len(s)
@@ -547,100 +537,8 @@
x ^= length
return intmask(x)
-
-@not_rpython
-def _hash_string_siphash24(s):
- """This version is called when untranslated only."""
- import array
- from rpython.rlib.rsiphash import siphash24
- from rpython.rtyper.lltypesystem import lltype, rffi
- from rpython.rlib.rarithmetic import intmask
-
- if not isinstance(s, str):
- if isinstance(s, unicode):
- lst = map(ord, s)
- else:
- lst = map(ord, s.chars) # for rstr.STR or UNICODE
- # NOTE: a latin-1 unicode string must have the same hash as the
- # corresponding byte string.
- if all(n <= 0xFF for n in lst):
- kind = "B"
- elif rffi.sizeof(lltype.UniChar) == 4:
- kind = "I"
- else:
- kind = "H"
- s = array.array(kind, lst).tostring()
- ptr = rffi.str2charp(s)
- x = siphash24(ptr, len(s))
- rffi.free_charp(ptr)
- return intmask(x)
-
-def ll_hash_string_siphash24(ll_s):
- """Called from lltypesystem/rstr.py. 'll_s' is a rstr.STR or UNICODE."""
- from rpython.rlib.rsiphash import siphash24
- from rpython.rtyper.lltypesystem import lltype, llmemory, rffi, rstr
- from rpython.rlib.rarithmetic import intmask
-
- length = len(ll_s.chars)
- if lltype.typeOf(ll_s).TO.chars.OF == lltype.Char:
- # no GC operation from here!
- addr = rstr._get_raw_buf_string(rstr.STR, ll_s, 0)
- else:
- # NOTE: a latin-1 unicode string must have the same hash as the
- # corresponding byte string. If the unicode is all within
- # 0-255, then we need to allocate a byte buffer and copy the
- # latin-1 encoding in it manually.
- for i in range(length):
- if ord(ll_s.chars[i]) > 0xFF:
- # no GC operation from here!
- addr = rstr._get_raw_buf_unicode(rstr.UNICODE, ll_s, 0)
- length *= rffi.sizeof(rstr.UNICODE.chars.OF)
- break
- else:
- p = lltype.malloc(rffi.CCHARP.TO, length, flavor='raw')
- i = 0
- while i < length:
- p[i] = chr(ord(ll_s.chars[i]))
- i += 1
- x = siphash24(llmemory.cast_ptr_to_adr(p), length)
- lltype.free(p, flavor='raw')
- return intmask(x)
- x = siphash24(addr, length)
- keepalive_until_here(ll_s)
- return intmask(x)
-ll_hash_string_siphash24._jit_look_inside_ = False
-
-
-@not_rpython
-def _hash_string(s):
- """The algorithm behind compute_hash() for a string or a unicode.
- This version is only for untranslated usage, and 's' is a str or unicode.
- """
- global HASH_ALGORITHM_FIXED
- HASH_ALGORITHM_FIXED = True
- if HASH_ALGORITHM == "rpython":
- return _hash_string_rpython(s)
- if HASH_ALGORITHM == "siphash24":
- return _hash_string_siphash24(s)
- raise NotImplementedError
-
def ll_hash_string(ll_s):
- """The algorithm behind compute_hash() for a string or a unicode.
- This version is called from lltypesystem/rstr.py, and 'll_s' is a
- rstr.STR or rstr.UNICODE.
- """
- if not we_are_translated():
- global HASH_ALGORITHM_FIXED
- HASH_ALGORITHM_FIXED = True
- if HASH_ALGORITHM == "rpython":
- return _hash_string_rpython(ll_s.chars)
- if HASH_ALGORITHM == "siphash24":
- if we_are_translated():
- return ll_hash_string_siphash24(ll_s)
- else:
- return _hash_string_siphash24(ll_s)
- raise NotImplementedError
-
+ return _hash_string(ll_s.chars)
def _hash_float(f):
"""The algorithm behind compute_hash() for a float.
@@ -698,6 +596,21 @@
return hop.gendirectcall(ll_fn, v_obj)
class Entry(ExtRegistryEntry):
+ _about_ = ll_hash_string
+ # this is only used when annotating the code in rstr.py, and so
+ # it always occurs after the RPython program signalled its intent
+ # to use a different hash. The code below overwrites the use of
+ # ll_hash_string() to make the annotator think a possibly different
+ # function was called.
+
+ def compute_annotation(self):
+ from rpython.annotator import model as annmodel
+ bk = self.bookkeeper
+ translator = bk.annotator.translator
+ fn = getattr(translator, 'll_hash_string', ll_hash_string)
+ return annmodel.SomePBC([bk.getdesc(fn)])
+
+class Entry(ExtRegistryEntry):
_about_ = compute_identity_hash
def compute_result_annotation(self, s_x):
diff --git a/rpython/rlib/rsiphash.py b/rpython/rlib/rsiphash.py
--- a/rpython/rlib/rsiphash.py
+++ b/rpython/rlib/rsiphash.py
@@ -1,13 +1,21 @@
+"""
+This module implements siphash-2-4, the hashing algorithm for strings
+and unicodes. You can use it explicitly by calling siphash24() with
+a byte string, or you can use enable_siphash24() to enable the use
+of siphash-2-4 on all RPython strings and unicodes in your program
+after translation.
+"""
import sys, os
from contextlib import contextmanager
from rpython.rlib import rarithmetic, rurandom
from rpython.rlib.objectmodel import not_rpython, always_inline
from rpython.rlib.objectmodel import we_are_translated, dont_inline
-from rpython.rlib.rgc import no_collect
+from rpython.rlib import rgc, jit
from rpython.rlib.rarithmetic import r_uint64, r_uint32, r_uint
from rpython.rlib.rawstorage import misaligned_is_fine
-from rpython.rtyper.lltypesystem import lltype, llmemory, rffi
+from rpython.rtyper.lltypesystem import lltype, llmemory, rffi, rstr
from rpython.rtyper.lltypesystem.lloperation import llop
+from rpython.rtyper.extregistry import ExtRegistryEntry
if sys.byteorder == 'little':
@@ -19,18 +27,23 @@
class Seed:
k0l = k1l = r_uint64(0)
- initialized = False
seed = Seed()
+def _decode64(s):
+ return (r_uint64(ord(s[0])) |
+ r_uint64(ord(s[1])) << 8 |
+ r_uint64(ord(s[2])) << 16 |
+ r_uint64(ord(s[3])) << 24 |
+ r_uint64(ord(s[4])) << 32 |
+ r_uint64(ord(s[5])) << 40 |
+ r_uint64(ord(s[6])) << 48 |
+ r_uint64(ord(s[7])) << 56)
+
def select_random_seed(s):
"""'s' is a string of length 16"""
- seed.k0l = (
- ord(s[0]) | ord(s[1]) << 8 | ord(s[2]) << 16 | ord(s[3]) << 24 |
- ord(s[4]) << 32 | ord(s[5]) << 40 | ord(s[6]) << 48 | ord(s[7]) << 56)
- seed.k1l = (
- ord(s[8]) | ord(s[9]) << 8 | ord(s[10]) << 16 | ord(s[11]) << 24 |
- ord(s[12]) << 32 | ord(s[13]) << 40 | ord(s[14]) << 48 | ord(s[15]) << 56)
+ seed.k0l = _decode64(s)
+ seed.k1l = _decode64(s[8:16])
random_ctx = rurandom.init_urandom()
@@ -63,20 +76,85 @@
env_var_name = "PYTHONHASHSEED"
-@dont_inline
def initialize_from_env():
# This uses the same algorithms as CPython 3.5. The environment
# variable we read also defaults to "PYTHONHASHSEED". If needed,
# a different RPython interpreter can patch the value of the
- # global variable 'env_var_name', or completely patch this function
- # with a different one.
+ # global variable 'env_var_name', or just pass a different init
+ # function to enable_siphash24().
value = os.environ.get(env_var_name)
if len(value) > 0 and value != "random":
s = lcg_urandom(value)
else:
s = rurandom.urandom(random_ctx, 16)
select_random_seed(s)
- seed.initialized = True
+
+_FUNC = lltype.Ptr(lltype.FuncType([], lltype.Void))
+
+def enable_siphash24(*init):
+ """
+ Enable the use of siphash-2-4 for all RPython strings and unicodes
+ in the translated program. You must call this function somewhere
+ in your interpreter (from a place that is annotated). Optionally,
+ you can pass a function to call to initialize the state; the default
+ is 'initialize_from_env' above. Don't call this more than once.
+ """
+ _internal_enable_siphash24()
+ if init:
+ (init_func,) = init
+ else:
+ init_func = initialize_from_env
+ llop.call_at_startup(lltype.Void, llexternal(_FUNC, init_func))
+
+def _internal_enable_siphash24():
+ pass
+
+class Entry(ExtRegistryEntry):
+ _about_ = _internal_enable_siphash24
+
+ def compute_result_annotation(self):
+ translator = self.bookkeeper.annotator.translator
+ if hasattr(translator, 'll_hash_string'):
+ assert translator.ll_hash_string == ll_hash_string_siphash24
+ else:
+ translator.ll_hash_string = ll_hash_string_siphash24
+
+ def specialize_call(self, hop):
+ hop.exception_cannot_occur()
+
+@rgc.no_collect
+def ll_hash_string_siphash24(ll_s):
+ """Called indirectly from lltypesystem/rstr.py, by redirection from
+ objectmodel.ll_hash_string().
+ """
+ from rpython.rlib.rarithmetic import intmask
+
+ # This function is entirely @rgc.no_collect.
+ length = len(ll_s.chars)
+ if lltype.typeOf(ll_s).TO.chars.OF == lltype.Char: # regular STR
+ addr = rstr._get_raw_buf_string(rstr.STR, ll_s, 0)
+ else:
+ # NOTE: a latin-1 unicode string must have the same hash as the
+ # corresponding byte string. If the unicode is all within
+ # 0-255, then we need to allocate a byte buffer and copy the
+ # latin-1 encoding in it manually.
+ for i in range(length):
+ if ord(ll_s.chars[i]) > 0xFF:
+ addr = rstr._get_raw_buf_unicode(rstr.UNICODE, ll_s, 0)
+ length *= rffi.sizeof(rstr.UNICODE.chars.OF)
+ break
+ else:
+ p = lltype.malloc(rffi.CCHARP.TO, length, flavor='raw')
+ i = 0
+ while i < length:
+ p[i] = chr(ord(ll_s.chars[i]))
+ i += 1
+ x = _siphash24(llmemory.cast_ptr_to_adr(p), length)
+ lltype.free(p, flavor='raw')
+ return intmask(x)
+ x = _siphash24(addr, length)
+ keepalive_until_here(ll_s)
+ return intmask(x)
@contextmanager
@@ -123,13 +201,11 @@
return v0, v1, v2, v3
-@no_collect
-def siphash24(addr_in, size):
+@rgc.no_collect
+def _siphash24(addr_in, size):
"""Takes an address pointer and a size. Returns the hash as a r_uint64,
which can then be cast to the expected type."""
- if we_are_translated() and not seed.initialized:
- initialize_from_env()
k0 = seed.k0l
k1 = seed.k1l
b = r_uint64(size) << 56
@@ -206,3 +282,13 @@
v0, v1, v2, v3 = _double_round(v0, v1, v2, v3)
return (v0 ^ v1) ^ (v2 ^ v3)
+
+
+@jit.dont_look_inside
+def siphash24(s):
+ """'s' is a normal string. Returns its siphash-2-4 as a r_uint64.
+ Don't forget to cast the result to a regular integer if needed,
+ e.g. with rarithmetic.intmask().
+ """
+ with rffi.scoped_nonmovingbuffer(s) as p:
+ return _siphash24(llmemory.cast_ptr_to_adr(p), len(s))
diff --git a/rpython/rlib/test/test_rsiphash.py b/rpython/rlib/test/test_rsiphash.py
--- a/rpython/rlib/test/test_rsiphash.py
+++ b/rpython/rlib/test/test_rsiphash.py
@@ -1,5 +1,5 @@
import os
-from rpython.rlib.rsiphash import siphash24, choosen_seed
+from rpython.rlib.rsiphash import siphash24, _siphash24, choosen_seed
from rpython.rlib.rsiphash import initialize_from_env, seed
from rpython.rtyper.lltypesystem import llmemory, rffi
@@ -30,13 +30,11 @@
]
def check(s):
- p = rffi.str2charp(s)
q = rffi.str2charp('?' + s)
with choosen_seed(0x8a9f065a358479f4, 0x11cb1e9ee7f40e1f,
test_misaligned_path=True):
- x = siphash24(llmemory.cast_ptr_to_adr(p), len(s))
- y = siphash24(llmemory.cast_ptr_to_adr(rffi.ptradd(q, 1)), len(s))
- rffi.free_charp(p)
+ x = siphash24(s)
+ y = _siphash24(llmemory.cast_ptr_to_adr(rffi.ptradd(q, 1)), len(s))
rffi.free_charp(q)
assert x == y
return x
@@ -46,22 +44,27 @@
assert check(string) == expected
def test_fix_seed():
- p = rffi.str2charp("foo")
- adr = llmemory.cast_ptr_to_adr(p)
+ old_val = os.environ.get('PYTHONHASHSEED', None)
+ try:
+ os.environ['PYTHONHASHSEED'] = '0'
+ initialize_from_env()
+ assert siphash24("foo") == 15988776847138518036
+ # value checked with CPython 3.5
- os.environ['PYTHONHASHSEED'] = '0'
- initialize_from_env()
- assert siphash24(adr, 3) == 15988776847138518036 # checked with CPython 3.5
+ os.environ['PYTHONHASHSEED'] = '123'
+ initialize_from_env()
+ assert siphash24("foo") == 12577370453467666022
+ # value checked with CPython 3.5
- os.environ['PYTHONHASHSEED'] = '123'
- initialize_from_env()
- assert siphash24(adr, 3) == 12577370453467666022 # checked with CPython 3.5
-
- os.environ['PYTHONHASHSEED'] = 'random'
- initialize_from_env()
- hash1 = siphash24(adr, 3)
- initialize_from_env()
- hash2 = siphash24(adr, 3)
- assert hash1 != hash2
-
- rffi.free_charp(p)
+ for env in ['', 'random']:
+ os.environ['PYTHONHASHSEED'] = env
+ initialize_from_env()
+ hash1 = siphash24("foo")
+ initialize_from_env()
+ hash2 = siphash24("foo")
+ assert hash1 != hash2 # extremely unlikely
+ finally:
+ if old_val is None:
+ del os.environ['PYTHONHASHSEED']
+ else:
+ os.environ['PYTHONHASHSEED'] = old_val
diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -829,7 +829,7 @@
return assert_str0(charpsize2str(cp, size))
charp2str._annenforceargs_ = [lltype.SomePtr(TYPEP)]
- # str -> char*, bool, bool
+ # str -> char*, flag
# Can't inline this because of the raw address manipulation.
@jit.dont_look_inside
def get_nonmovingbuffer(data):
diff --git a/rpython/rtyper/lltypesystem/rstr.py b/rpython/rtyper/lltypesystem/rstr.py
--- a/rpython/rtyper/lltypesystem/rstr.py
+++ b/rpython/rtyper/lltypesystem/rstr.py
@@ -169,10 +169,7 @@
for i in range(len(value)):
p.chars[i] = cast_primitive(self.base, value[i])
p.hash = 0
- if objectmodel.HASH_ALGORITHM == "rpython":
- self.ll.ll_strhash(p) # precompute the hash
- # but it is pointless if this hash wouldn't end up in the
- # C code anyway: see "remove_hash" in translator/c/node.py
+ self.ll.ll_strhash(p) # precompute the hash
self.CACHE[value] = p
return p
diff --git a/rpython/translator/c/node.py b/rpython/translator/c/node.py
--- a/rpython/translator/c/node.py
+++ b/rpython/translator/c/node.py
@@ -586,8 +586,10 @@
data.append((name, getattr(self.obj, name)))
if T._hints.get('remove_hash'):
- # hack for rstr.STR and UNICODE
- if objectmodel.HASH_ALGORITHM != "rpython":
+ # hack for rstr.STR and UNICODE: remove their .hash value
+ # and write 0 in the C sources, if we're using a non-default
+ # hash function.
+ if hasattr(self.db.translator, 'll_hash_string'):
i = 0
while data[i][0] != 'hash':
i += 1
More information about the pypy-commit mailing list