[pypy-commit] pypy default: Use custom assembler for divisions of 128 bits by 64 bits with a result

Sun Jun 26 16:21:04 EDT 2016

Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r85386:800377eb1f02
Date: 2016-06-26 15:37 +0200
http://bitbucket.org/pypy/pypy/changeset/800377eb1f02/

Log:	Use custom assembler for divisions of 128 bits by 64 bits with a
	result that fits 64 bits. It's hard to get this effect automatically
	while writing C.

diff --git a/rpython/rlib/rbigint.py b/rpython/rlib/rbigint.py
--- a/rpython/rlib/rbigint.py
+++ b/rpython/rlib/rbigint.py
@@ -1827,6 +1827,8 @@
     Divide bigint pin by non-zero digit n, storing quotient
     in pout, and returning the remainder. It's OK for pin == pout on entry.
     """
+    from rpython.rtyper.lltypesystem.lloperation import llop
+
     rem = _widen_digit(0)
     assert n > 0 and n <= MASK
     if not size:
@@ -1834,7 +1836,7 @@
     size -= 1
     while size >= 0:
         rem = (rem << SHIFT) | pin.widedigit(size)
-        hi = rem // n
+        hi = llop.long2_floordiv(lltype.Signed, rem, n)
         pout.setdigit(size, hi)
         rem -= hi * n
         size -= 1
@@ -1924,6 +1926,7 @@
     z._normalize()
     return z
 _muladd1._annspecialcase_ = "specialize:argtype(2)"
+
 def _v_lshift(z, a, m, d):
     """ Shift digit vector a[0:m] d bits left, with 0 <= d < SHIFT. Put
         * result in z[0:m], and return the d bits shifted out of the top.
@@ -1961,6 +1964,8 @@
 
 def _x_divrem(v1, w1):
     """ Unsigned bigint division with remainder -- the algorithm """
+    from rpython.rtyper.lltypesystem.lloperation import llop
+
     size_v = v1.numdigits()
     size_w = w1.numdigits()
     assert size_v >= size_w and size_w > 1
@@ -1991,6 +1996,7 @@
     assert k > 0
     a = rbigint([NULLDIGIT] * k, 1, k)
 
+    wm1s = w.digit(abs(size_w-1))
     wm1 = w.widedigit(abs(size_w-1))
     wm2 = w.widedigit(abs(size_w-2))
 
@@ -2008,7 +2014,7 @@
             vtop = v.widedigit(j)
         assert vtop <= wm1
         vv = (vtop << SHIFT) | v.widedigit(abs(j-1))
-        q = vv / wm1
+        q = llop.long2_floordiv(lltype.Signed, vv, wm1s)
         r = vv - wm1 * q
         while wm2 * q > ((r << SHIFT) | v.widedigit(abs(j-2))):
             q -= 1
diff --git a/rpython/rtyper/lltypesystem/lloperation.py b/rpython/rtyper/lltypesystem/lloperation.py
--- a/rpython/rtyper/lltypesystem/lloperation.py
+++ b/rpython/rtyper/lltypesystem/lloperation.py
@@ -319,6 +319,9 @@
     'lllong_rshift':         LLOp(canfold=True),  # args (r_longlonglong, int)
     'lllong_xor':            LLOp(canfold=True),
 
+    'long2_floordiv':       LLOp(canfold=True),  # (double-r_long, int) => int
+                                                 # (all integers signed)
+
     'cast_primitive':       LLOp(canfold=True),
     'cast_bool_to_int':     LLOp(canfold=True),
     'cast_bool_to_uint':    LLOp(canfold=True),
diff --git a/rpython/rtyper/lltypesystem/opimpl.py b/rpython/rtyper/lltypesystem/opimpl.py
--- a/rpython/rtyper/lltypesystem/opimpl.py
+++ b/rpython/rtyper/lltypesystem/opimpl.py
@@ -16,7 +16,7 @@
                         'bool': True, 'is_true':True}
 
 # global synonyms for some types
-from rpython.rlib.rarithmetic import intmask
+from rpython.rlib.rarithmetic import intmask, base_int
 from rpython.rlib.rarithmetic import r_int, r_uint, r_longlong, r_ulonglong, r_longlonglong
 from rpython.rtyper.lltypesystem.llmemory import AddressAsInt
 
@@ -733,6 +733,16 @@
     assert isinstance(x, bool)
     return x
 
+def op_long2_floordiv(x, y):  # Python-level implementation of the 'long2_floordiv' llop
+    if lltype.typeOf(x) != lltype.Signed:  # otherwise 'x' must be a wide integer:
+        assert isinstance(x, base_int)
+        assert x.BITS == 2 * r_int.BITS    # exactly twice as wide as r_int
+        assert x.SIGNED
+    assert lltype.typeOf(y) is lltype.Signed
+    result = int(x) // y  # '//' floors; NOTE(review): C '/' and idiv truncate -- confirm x, y >= 0
+    assert result == intmask(result), "overflow in long2_floordiv"  # must be unchanged by intmask()
+    return result
+
 # ____________________________________________________________
 
 def get_op_impl(opname):
diff --git a/rpython/translator/c/src/asm_gcc_x86.h b/rpython/translator/c/src/asm_gcc_x86.h
--- a/rpython/translator/c/src/asm_gcc_x86.h
+++ b/rpython/translator/c/src/asm_gcc_x86.h
@@ -106,3 +106,11 @@
 #define PYPY_X86_CHECK_SSE2_DEFINED
 RPY_EXTERN void pypy_x86_check_sse2(void);
 #endif
+
+
+#undef OP_LONG2_FLOORDIV
+/* Assumes that 'y' and 'r' fit in a signed word, but 'x' takes up to two
+   words.  'idiv' also stores the remainder in edx: declare it as output. */
+#define OP_LONG2_FLOORDIV(x, y, r)  do { long _rem_unused;              \
+    __asm__("idiv %2" : "=a"(r), "=d"(_rem_unused) :                    \
+            "r"((long)(y)), "A"((long long)(x))); } while (0)
diff --git a/rpython/translator/c/src/asm_gcc_x86_64.h b/rpython/translator/c/src/asm_gcc_x86_64.h
--- a/rpython/translator/c/src/asm_gcc_x86_64.h
+++ b/rpython/translator/c/src/asm_gcc_x86_64.h
@@ -6,3 +6,10 @@
     asm volatile("rdtsc" : "=a"(_rax), "=d"(_rdx)); \
     val = (_rdx << 32) | _rax;                          \
 } while (0)
+
+#undef OP_LONG2_FLOORDIV
+/* Assumes that 'y' and 'r' fit in a signed word, but 'x' takes up to two
+   words.  'idiv' also stores the remainder in rdx: declare it as output. */
+#define OP_LONG2_FLOORDIV(x, y, r)  do { long _rem_unused;              \
+    __asm__("idiv %2" : "=a"(r), "=d"(_rem_unused) : "r"((long)(y)),    \
+            "a"((long)(x)), "d"((long)(((x) >> 32) >> 32))); } while (0)
diff --git a/rpython/translator/c/src/int.h b/rpython/translator/c/src/int.h
--- a/rpython/translator/c/src/int.h
+++ b/rpython/translator/c/src/int.h
@@ -135,6 +135,7 @@
 #define OP_LLONG_FLOORDIV(x,y,r)  r = (x) / (y)
 #define OP_ULLONG_FLOORDIV(x,y,r) r = (x) / (y)
 #define OP_LLLONG_FLOORDIV(x,y,r)  r = (x) / (y)
+#define OP_LONG2_FLOORDIV(x,y,r)  r = (x) / (y)  /* generic C version; asm_gcc_x86*.h #undef and redefine it */
 
 /* modulus */
 
diff --git a/rpython/translator/c/test/test_lltyped.py b/rpython/translator/c/test/test_lltyped.py
--- a/rpython/translator/c/test/test_lltyped.py
+++ b/rpython/translator/c/test/test_lltyped.py
@@ -1,6 +1,7 @@
-import py
+import py, sys, random
 from rpython.rtyper.lltypesystem.lltype import *
 from rpython.rtyper.lltypesystem import rffi
+from rpython.rtyper.lltypesystem.lloperation import llop
 from rpython.translator.c.test.test_genc import compile
 from rpython.tool.sourcetools import func_with_new_name
 
@@ -1023,3 +1024,27 @@
         assert fn(r_longlong(1)) == True
         assert fn(r_longlong(256)) == True
         assert fn(r_longlong(2**32)) == True
+
+    def test_long2_floordiv(self):  # compiled 'long2_floordiv' checked against Python '//'
+        def f(a, b):
+            return llop.long2_floordiv(Signed, a, b)
+        fn = self.getcompiled(f, [int, int])
+        assert fn(100, 3) == 33  # both operands are plain word-sized ints
+        # second part: the dividend is twice as wide as a word, built from two ints
+        if sys.maxint > 2**32:  # a word is wider than 32 bits
+            HUGE = getattr(rffi, '__INT128_T', None)  # None if rffi has no such type
+            bits = 128
+        else:
+            HUGE = SignedLongLong
+            bits = 64
+        if HUGE is not None:  # otherwise the wide-dividend case is silently skipped
+            def f(a, b, c):
+                ab = (rffi.cast(HUGE, a) << (bits//2)) | b  # a = high word, b = low word
+                return llop.long2_floordiv(Signed, ab, c)
+            fn = self.getcompiled(f, [int, int, int])
+            for i in range(100):
+                a = random.randrange(0, 10)
+                b = random.randrange(0, sys.maxint+1)
+                c = random.randrange(2*a+2, 25)  # c >= 2*a+2, so the quotient fits a word
+                print a, b, c  # records the random inputs (the RNG is not seeded here)
+                assert fn(a, b, c) == ((a << (bits//2)) | b) // c