[pypy-commit] pypy rpython-hash: in-progress: generate the random seed at runtime

Sun Jan 29 12:51:05 EST 2017

Author: Armin Rigo <arigo at tunes.org>
Branch: rpython-hash
Changeset: r89828:746716a29381
Date: 2017-01-29 18:10 +0100
http://bitbucket.org/pypy/pypy/changeset/746716a29381/

Log:	in-progress: generate the random seed at runtime

diff --git a/rpython/rlib/objectmodel.py b/rpython/rlib/objectmodel.py
--- a/rpython/rlib/objectmodel.py
+++ b/rpython/rlib/objectmodel.py
@@ -519,21 +519,11 @@
 
 # ----------
 
-HASH_ALGORITHM = "rpython"    # the default, no source of randomness possible
-HASH_ALGORITHM_FIXED = False
-
-@not_rpython
-def set_hash_algorithm(algo):
-    """Must be called very early, before any string is hashed with
-    compute_hash()!"""
-    global HASH_ALGORITHM
-    if HASH_ALGORITHM != algo:
-        assert not HASH_ALGORITHM_FIXED, "compute_hash() already called!"
-        assert algo in ("rpython", "siphash24")
-        HASH_ALGORITHM = algo
-
-
-def _hash_string_rpython(s):
+def _hash_string(s):
+    """The default algorithm behind compute_hash() for a string or a unicode.
+    There is a mechanism to use another one in programs after translation.
+    See rsiphash.py, which implements the algorithm of CPython >= 3.4.
+    """
     from rpython.rlib.rarithmetic import intmask
 
     length = len(s)
@@ -547,100 +537,8 @@
     x ^= length
     return intmask(x)
 
-
-@not_rpython
-def _hash_string_siphash24(s):
-    """This version is called when untranslated only."""
-    import array
-    from rpython.rlib.rsiphash import siphash24
-    from rpython.rtyper.lltypesystem import lltype, rffi
-    from rpython.rlib.rarithmetic import intmask
-
-    if not isinstance(s, str):
-        if isinstance(s, unicode):
-            lst = map(ord, s)
-        else:
-            lst = map(ord, s.chars)    # for rstr.STR or UNICODE
-        # NOTE: a latin-1 unicode string must have the same hash as the
-        # corresponding byte string.
-        if all(n <= 0xFF for n in lst):
-            kind = "B"
-        elif rffi.sizeof(lltype.UniChar) == 4:
-            kind = "I"
-        else:
-            kind = "H"
-        s = array.array(kind, lst).tostring()
-    ptr = rffi.str2charp(s)
-    x = siphash24(ptr, len(s))
-    rffi.free_charp(ptr)
-    return intmask(x)
-
-def ll_hash_string_siphash24(ll_s):
-    """Called from lltypesystem/rstr.py.  'll_s' is a rstr.STR or UNICODE."""
-    from rpython.rlib.rsiphash import siphash24
-    from rpython.rtyper.lltypesystem import lltype, llmemory, rffi, rstr
-    from rpython.rlib.rarithmetic import intmask
-
-    length = len(ll_s.chars)
-    if lltype.typeOf(ll_s).TO.chars.OF == lltype.Char:
-        # no GC operation from here!
-        addr = rstr._get_raw_buf_string(rstr.STR, ll_s, 0)
-    else:
-        # NOTE: a latin-1 unicode string must have the same hash as the
-        # corresponding byte string.  If the unicode is all within
-        # 0-255, then we need to allocate a byte buffer and copy the
-        # latin-1 encoding in it manually.
-        for i in range(length):
-            if ord(ll_s.chars[i]) > 0xFF:
-                # no GC operation from here!
-                addr = rstr._get_raw_buf_unicode(rstr.UNICODE, ll_s, 0)
-                length *= rffi.sizeof(rstr.UNICODE.chars.OF)
-                break
-        else:
-            p = lltype.malloc(rffi.CCHARP.TO, length, flavor='raw')
-            i = 0
-            while i < length:
-                p[i] = chr(ord(ll_s.chars[i]))
-                i += 1
-            x = siphash24(llmemory.cast_ptr_to_adr(p), length)
-            lltype.free(p, flavor='raw')
-            return intmask(x)
-    x = siphash24(addr, length)
-    keepalive_until_here(ll_s)
-    return intmask(x)
-ll_hash_string_siphash24._jit_look_inside_ = False
-
-
-@not_rpython
-def _hash_string(s):
-    """The algorithm behind compute_hash() for a string or a unicode.
-    This version is only for untranslated usage, and 's' is a str or unicode.
-    """
-    global HASH_ALGORITHM_FIXED
-    HASH_ALGORITHM_FIXED = True
-    if HASH_ALGORITHM == "rpython":
-        return _hash_string_rpython(s)
-    if HASH_ALGORITHM == "siphash24":
-        return _hash_string_siphash24(s)
-    raise NotImplementedError
-
 def ll_hash_string(ll_s):
-    """The algorithm behind compute_hash() for a string or a unicode.
-    This version is called from lltypesystem/rstr.py, and 'll_s' is a
-    rstr.STR or rstr.UNICODE.
-    """
-    if not we_are_translated():
-        global HASH_ALGORITHM_FIXED
-        HASH_ALGORITHM_FIXED = True
-    if HASH_ALGORITHM == "rpython":
-        return _hash_string_rpython(ll_s.chars)
-    if HASH_ALGORITHM == "siphash24":
-        if we_are_translated():
-            return ll_hash_string_siphash24(ll_s)
-        else:
-            return _hash_string_siphash24(ll_s)
-    raise NotImplementedError
-
+    return _hash_string(ll_s.chars)
 
 def _hash_float(f):
     """The algorithm behind compute_hash() for a float.
@@ -698,6 +596,21 @@
         return hop.gendirectcall(ll_fn, v_obj)
 
 class Entry(ExtRegistryEntry):
+    _about_ = ll_hash_string
+    # this is only used when annotating the code in rstr.py, and so
+    # it always occurs after the RPython program signalled its intent
+    # to use a different hash.  The code below overwrites the use of
+    # ll_hash_string() to make the annotator think a possibly different
+    # function was called.
+
+    def compute_annotation(self):
+        from rpython.annotator import model as annmodel
+        bk = self.bookkeeper
+        translator = bk.annotator.translator
+        fn = getattr(translator, 'll_hash_string', ll_hash_string)
+        return annmodel.SomePBC([bk.getdesc(fn)])
+
+class Entry(ExtRegistryEntry):
     _about_ = compute_identity_hash
 
     def compute_result_annotation(self, s_x):
diff --git a/rpython/rlib/rsiphash.py b/rpython/rlib/rsiphash.py
--- a/rpython/rlib/rsiphash.py
+++ b/rpython/rlib/rsiphash.py
@@ -1,13 +1,21 @@
+"""
+This module implements siphash-2-4, the hashing algorithm for strings
+and unicodes.  You can use it explicitly by calling siphash24() with
+a byte string, or you can use enable_siphash24() to enable the use
+of siphash-2-4 on all RPython strings and unicodes in your program
+after translation.
+"""
 import sys, os
 from contextlib import contextmanager
 from rpython.rlib import rarithmetic, rurandom
 from rpython.rlib.objectmodel import not_rpython, always_inline
 from rpython.rlib.objectmodel import we_are_translated, dont_inline
-from rpython.rlib.rgc import no_collect
+from rpython.rlib import rgc, jit
 from rpython.rlib.rarithmetic import r_uint64, r_uint32, r_uint
 from rpython.rlib.rawstorage import misaligned_is_fine
-from rpython.rtyper.lltypesystem import lltype, llmemory, rffi
+from rpython.rtyper.lltypesystem import lltype, llmemory, rffi, rstr
 from rpython.rtyper.lltypesystem.lloperation import llop
+from rpython.rtyper.extregistry import ExtRegistryEntry
 
 
 if sys.byteorder == 'little':
@@ -19,18 +27,23 @@
 
 class Seed:
     k0l = k1l = r_uint64(0)
-    initialized = False
 seed = Seed()
 
 
+def _decode64(s):
+    return (r_uint64(ord(s[0])) |
+            r_uint64(ord(s[1])) << 8 |
+            r_uint64(ord(s[2])) << 16 |
+            r_uint64(ord(s[3])) << 24 |
+            r_uint64(ord(s[4])) << 32 |
+            r_uint64(ord(s[5])) << 40 |
+            r_uint64(ord(s[6])) << 48 |
+            r_uint64(ord(s[7])) << 56)
+
 def select_random_seed(s):
     """'s' is a string of length 16"""
-    seed.k0l = (
-      ord(s[0]) | ord(s[1]) << 8 | ord(s[2]) << 16 | ord(s[3]) << 24 |
-      ord(s[4]) << 32 | ord(s[5]) << 40 | ord(s[6]) << 48 | ord(s[7]) << 56)
-    seed.k1l = (
-      ord(s[8]) | ord(s[9]) << 8 | ord(s[10]) << 16 | ord(s[11]) << 24 |
-      ord(s[12]) << 32 | ord(s[13]) << 40 | ord(s[14]) << 48 | ord(s[15]) << 56)
+    seed.k0l = _decode64(s)
+    seed.k1l = _decode64(s[8:16])
 
 
 random_ctx = rurandom.init_urandom()
@@ -63,20 +76,85 @@
 
 env_var_name = "PYTHONHASHSEED"
 
-@dont_inline
 def initialize_from_env():
     # This uses the same algorithms as CPython 3.5.  The environment
     # variable we read also defaults to "PYTHONHASHSEED".  If needed,
     # a different RPython interpreter can patch the value of the
-    # global variable 'env_var_name', or completely patch this function
-    # with a different one.
+    # global variable 'env_var_name', or just pass a different init
+    # function to enable_siphash24().
     value = os.environ.get(env_var_name)
     if len(value) > 0 and value != "random":
         s = lcg_urandom(value)
     else:
         s = rurandom.urandom(random_ctx, 16)
     select_random_seed(s)
-    seed.initialized = True
+
+_FUNC = lltype.Ptr(lltype.FuncType([], lltype.Void))
+
+def enable_siphash24(*init):
+    """
+    Enable the use of siphash-2-4 for all RPython strings and unicodes
+    in the translated program.  You must call this function somewhere
+    in your interpreter (from a place that is annotated).  Optionally,
+    you can pass a function to call to initialize the state; the default
+    is 'initialize_from_env' above.  Don't call this more than once.
+    """
+    _internal_enable_siphash24()
+    if init:
+        (init_func,) = init
+    else:
+        init_func = initialize_from_env
+    llop.call_at_startup(lltype.Void, llexternal(_FUNC, init_func))
+
+def _internal_enable_siphash24():
+    pass
+
+class Entry(ExtRegistryEntry):
+    _about_ = _internal_enable_siphash24
+
+    def compute_result_annotation(self):
+        translator = self.bookkeeper.annotator.translator
+        if hasattr(translator, 'll_hash_string'):
+            assert translator.ll_hash_string == ll_hash_string_siphash24
+        else:
+            translator.ll_hash_string = ll_hash_string_siphash24
+
+    def specialize_call(self, hop):
+        hop.exception_cannot_occur()
+
+@rgc.no_collect
+def ll_hash_string_siphash24(ll_s):
+    """Called indirectly from lltypesystem/rstr.py, by redirection from
+    objectmodel.ll_hash_string().
+    """
+    from rpython.rlib.rarithmetic import intmask
+
+    # This function is entirely @rgc.no_collect.
+    length = len(ll_s.chars)
+    if lltype.typeOf(ll_s).TO.chars.OF == lltype.Char:   # regular STR
+        addr = rstr._get_raw_buf_string(rstr.STR, ll_s, 0)
+    else:
+        # NOTE: a latin-1 unicode string must have the same hash as the
+        # corresponding byte string.  If the unicode is all within
+        # 0-255, then we need to allocate a byte buffer and copy the
+        # latin-1 encoding in it manually.
+        for i in range(length):
+            if ord(ll_s.chars[i]) > 0xFF:
+                addr = rstr._get_raw_buf_unicode(rstr.UNICODE, ll_s, 0)
+                length *= rffi.sizeof(rstr.UNICODE.chars.OF)
+                break
+        else:
+            p = lltype.malloc(rffi.CCHARP.TO, length, flavor='raw')
+            i = 0
+            while i < length:
+                p[i] = chr(ord(ll_s.chars[i]))
+                i += 1
+            x = _siphash24(llmemory.cast_ptr_to_adr(p), length)
+            lltype.free(p, flavor='raw')
+            return intmask(x)
+    x = _siphash24(addr, length)
+    keepalive_until_here(ll_s)
+    return intmask(x)
 
 
 @contextmanager
@@ -123,13 +201,11 @@
     return v0, v1, v2, v3
 
 
-@no_collect
-def siphash24(addr_in, size):
+@rgc.no_collect
+def _siphash24(addr_in, size):
     """Takes an address pointer and a size.  Returns the hash as a r_uint64,
     which can then be casted to the expected type."""
 
-    if we_are_translated() and not seed.initialized:
-        initialize_from_env()
     k0 = seed.k0l
     k1 = seed.k1l
     b = r_uint64(size) << 56
@@ -206,3 +282,13 @@
     v0, v1, v2, v3 = _double_round(v0, v1, v2, v3)
 
     return (v0 ^ v1) ^ (v2 ^ v3)
+
+
+@jit.dont_look_inside
+def siphash24(s):
+    """'s' is a normal string.  Returns its siphash-2-4 as a r_uint64.
+    Don't forget to cast the result to a regular integer if needed,
+    e.g. with rarithmetic.intmask().
+    """
+    with rffi.scoped_nonmovingbuffer(s) as p:
+        return _siphash24(llmemory.cast_ptr_to_adr(p), len(s))
diff --git a/rpython/rlib/test/test_rsiphash.py b/rpython/rlib/test/test_rsiphash.py
--- a/rpython/rlib/test/test_rsiphash.py
+++ b/rpython/rlib/test/test_rsiphash.py
@@ -1,5 +1,5 @@
 import os
-from rpython.rlib.rsiphash import siphash24, choosen_seed
+from rpython.rlib.rsiphash import siphash24, _siphash24, choosen_seed
 from rpython.rlib.rsiphash import initialize_from_env, seed
 from rpython.rtyper.lltypesystem import llmemory, rffi
 
@@ -30,13 +30,11 @@
 ]
 
 def check(s):
-    p = rffi.str2charp(s)
     q = rffi.str2charp('?' + s)
     with choosen_seed(0x8a9f065a358479f4, 0x11cb1e9ee7f40e1f,
                       test_misaligned_path=True):
-        x = siphash24(llmemory.cast_ptr_to_adr(p), len(s))
-        y = siphash24(llmemory.cast_ptr_to_adr(rffi.ptradd(q, 1)), len(s))
-    rffi.free_charp(p)
+        x = siphash24(s)
+        y = _siphash24(llmemory.cast_ptr_to_adr(rffi.ptradd(q, 1)), len(s))
     rffi.free_charp(q)
     assert x == y
     return x
@@ -46,22 +44,27 @@
         assert check(string) == expected
 
 def test_fix_seed():
-    p = rffi.str2charp("foo")
-    adr = llmemory.cast_ptr_to_adr(p)
+    old_val = os.environ.get('PYTHONHASHSEED', None)
+    try:
+        os.environ['PYTHONHASHSEED'] = '0'
+        initialize_from_env()
+        assert siphash24("foo") == 15988776847138518036
+        # value checked with CPython 3.5
 
-    os.environ['PYTHONHASHSEED'] = '0'
-    initialize_from_env()
-    assert siphash24(adr, 3) == 15988776847138518036 # checked with CPython 3.5
+        os.environ['PYTHONHASHSEED'] = '123'
+        initialize_from_env()
+        assert siphash24("foo") == 12577370453467666022
+        # value checked with CPython 3.5
 
-    os.environ['PYTHONHASHSEED'] = '123'
-    initialize_from_env()
-    assert siphash24(adr, 3) == 12577370453467666022 # checked with CPython 3.5
-
-    os.environ['PYTHONHASHSEED'] = 'random'
-    initialize_from_env()
-    hash1 = siphash24(adr, 3)
-    initialize_from_env()
-    hash2 = siphash24(adr, 3)
-    assert hash1 != hash2
-
-    rffi.free_charp(p)
+        for env in ['', 'random']:
+            os.environ['PYTHONHASHSEED'] = env
+            initialize_from_env()
+            hash1 = siphash24("foo")
+            initialize_from_env()
+            hash2 = siphash24("foo")
+            assert hash1 != hash2     # extremely unlikely
+    finally:
+        if old_val is None:
+            del os.environ['PYTHONHASHSEED']
+        else:
+            os.environ['PYTHONHASHSEED'] = old_val
diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -829,7 +829,7 @@
         return assert_str0(charpsize2str(cp, size))
     charp2str._annenforceargs_ = [lltype.SomePtr(TYPEP)]
 
-    # str -> char*, bool, bool
+    # str -> char*, flag
     # Can't inline this because of the raw address manipulation.
     @jit.dont_look_inside
     def get_nonmovingbuffer(data):
diff --git a/rpython/rtyper/lltypesystem/rstr.py b/rpython/rtyper/lltypesystem/rstr.py
--- a/rpython/rtyper/lltypesystem/rstr.py
+++ b/rpython/rtyper/lltypesystem/rstr.py
@@ -169,10 +169,7 @@
             for i in range(len(value)):
                 p.chars[i] = cast_primitive(self.base, value[i])
             p.hash = 0
-            if objectmodel.HASH_ALGORITHM == "rpython":
-                self.ll.ll_strhash(p)   # precompute the hash
-                # but it is pointless if this hash wouldn't end up in the
-                # C code anyway: see "remove_hash" in translator/c/node.py
+            self.ll.ll_strhash(p)   # precompute the hash
             self.CACHE[value] = p
             return p
 
diff --git a/rpython/translator/c/node.py b/rpython/translator/c/node.py
--- a/rpython/translator/c/node.py
+++ b/rpython/translator/c/node.py
@@ -586,8 +586,10 @@
             data.append((name, getattr(self.obj, name)))
 
         if T._hints.get('remove_hash'):
-            # hack for rstr.STR and UNICODE
-            if objectmodel.HASH_ALGORITHM != "rpython":
+            # hack for rstr.STR and UNICODE: remove their .hash value
+            # and write 0 in the C sources, if we're using a non-default
+            # hash function.
+            if hasattr(self.db.translator, 'll_hash_string'):
                 i = 0
                 while data[i][0] != 'hash':
                     i += 1