[pypy-commit] pypy ppc-vsx-support: first vector loop successfully compiles on ppc (floating point only)

Mon Jun 20 03:06:08 EDT 2016

Author: Richard Plangger <planrichi at gmail.com>
Branch: ppc-vsx-support
Changeset: r85231:1352b56d157d
Date: 2016-06-20 09:05 +0200
http://bitbucket.org/pypy/pypy/changeset/1352b56d157d/

Log:	first vector loop successfully compiles on ppc (floating point only)

diff --git a/rpython/jit/backend/ppc/codebuilder.py b/rpython/jit/backend/ppc/codebuilder.py
--- a/rpython/jit/backend/ppc/codebuilder.py
+++ b/rpython/jit/backend/ppc/codebuilder.py
@@ -61,7 +61,8 @@
 XFL = Form("FM", "frB", "XO1", "Rc")
 XFX = Form("CRM", "rS", "XO1")
 XLL = Form("LL", "XO1")
-XX1 = Form("vrT", "rA", "rB", "XO1")
+XX1 = Form("fvrT", "rA", "rB", "XO1")
+XX3 = Form("fvrT", "fvrA", "fvrB", "XO9")
 VX = Form("lvrT", "lvrA", "lvrB", "XO8")
 
 MI = Form("rA", "rS", "SH", "MB", "ME", "Rc")
@@ -576,6 +577,12 @@
 class PPCVSXAssembler(object):
     _mixin_ = True
 
+    # floating point operations (ppc has its own vector
+    # unit for double/single precision floating point)
+
+    # FLOAT
+    # -----
+
     # load
     lxvdsx = XX1(31, XO1=332) # splat first element
     lxvd2x = XX1(31, XO1=844)
@@ -585,7 +592,23 @@
     stxvd2x = XX1(31, XO1=972)
     stxvw4x = XX1(31, XO1=908)
 
-    # integer
+    # arith
+
+    # add
+    xvadddp = XX3(60, XO9=96)
+    xvaddsp = XX3(60, XO9=64)
+    # sub
+    xvsubdp = XX3(60, XO9=104)
+    xvsubsp = XX3(60, XO9=72)
+    # mul
+    xvmuldp = XX3(60, XO9=112)
+    xvmulsp = XX3(60, XO9=80)
+    # div
+    xvdivdp = XX3(60, XO9=102)
+    xvdivsp = XX3(60, XO9=88)
+
+    # INTEGER
+    # -------
     vaddudm = VX(4, XO8=192)
 
 class PPCAssembler(BasicPPCAssembler, PPCVSXAssembler):
diff --git a/rpython/jit/backend/ppc/ppc_field.py b/rpython/jit/backend/ppc/ppc_field.py
--- a/rpython/jit/backend/ppc/ppc_field.py
+++ b/rpython/jit/backend/ppc/ppc_field.py
@@ -43,7 +43,9 @@
     "spr":    (11, 20),
     "TO":     ( 6, 10),
     "UIMM":   (16, 31),
-    "vrT":    (6,  31, 'unsigned', regname._V, 'overlap'),
+    "fvrT":   (6,  31, 'unsigned', regname._V, 'overlap'),
+    "fvrA":   (11, 31, 'unsigned', regname._V, 'overlap'),
+    "fvrB":   (16, 31, 'unsigned', regname._V, 'overlap'),
     # low vector register T (low in a sense:
     # can only address 32 vector registers)
     "lvrT":   (6,  10, 'unsigned', regname._V),
@@ -59,6 +61,7 @@
     "XO6":    (21, 29),
     "XO7":    (27, 30),
     "XO8":    (21, 31),
+    "XO9":    (21, 28),
     "LL":     ( 9, 10),
 }
 
@@ -110,18 +113,6 @@
         value = super(sh, self).decode(inst)
         return (value & 32) << 5 | (value >> 10 & 31)
 
-# ??? class tx(Field):
-# ???     def encode(self, value):
-# ???         value = (value & 31) << 20 | (value & 32) >> 5
-# ???         return super(tx, self).encode(value)
-# ???     def decode(self, inst):
-# ???         value = super(tx, self).decode(inst)
-# ???         return (value & 32) << 5 | (value >> 20 & 31)
-# ???     def r(self):
-# ???         import pdb; pdb.set_trace()
-# ???         return super(tx, self).r()
-# other special fields?
-
 ppc_fields = {
     "LI":  IField("LI", *fields["LI"]),
     "BD":  IField("BD", *fields["BD"]),
@@ -129,7 +120,6 @@
     "mbe": mbe("mbe",   *fields["mbe"]),
     "sh":  sh("sh",     *fields["sh"]),
     "spr": spr("spr",   *fields["spr"]),
-    # ??? "vrT": tx("vrT",    *fields["vrT"]),
 }
 
 for f in fields:
diff --git a/rpython/jit/backend/ppc/rassemblermaker.py b/rpython/jit/backend/ppc/rassemblermaker.py
--- a/rpython/jit/backend/ppc/rassemblermaker.py
+++ b/rpython/jit/backend/ppc/rassemblermaker.py
@@ -46,9 +46,15 @@
         elif field.name == 'sh':
             body.append('sh1 = (%s & 31) << 10 | (%s & 32) >> 5' % (value, value))
             value = 'sh1'
-        elif field.name == 'vrT':
+        elif field.name == 'fvrT':
             body.append('vrT1 = (%s & 31) << 21 | (%s & 32) >> 5' % (value, value))
             value = 'vrT1'
+        elif field.name == 'fvrA':
+            body.append('fvrA1 = ((%s & 31) << 15 | (%s & 32) >> 5) << 2' % (value, value))
+            value = 'fvrA1'
+        elif field.name == 'fvrB':
+            body.append('fvrB1 = ((%s & 31) << 10 | (%s & 32) >> 5) << 1' % (value, value))
+            value = 'fvrB1'
         if isinstance(field, IField):
             body.append('v |= ((%3s >> 2) & r_uint(%#05x)) << 2' % (value, field.mask))
         else:
diff --git a/rpython/jit/backend/ppc/vector_ext.py b/rpython/jit/backend/ppc/vector_ext.py
--- a/rpython/jit/backend/ppc/vector_ext.py
+++ b/rpython/jit/backend/ppc/vector_ext.py
@@ -48,19 +48,12 @@
 
     def _vec_load(self, resloc, baseloc, indexloc, integer, itemsize, aligned):
         if integer:
+            raise NotImplementedError
+        else:
             if itemsize == 4:
                 self.mc.lxvw4x(resloc.value, indexloc.value, baseloc.value)
             elif itemsize == 8:
                 self.mc.lxvd2x(resloc.value, indexloc.value, baseloc.value)
-            else:
-                raise NotImplementedError
-        else:
-            if itemsize == 4:
-                self.mc.MOVUPS(resloc, src_addr)
-            elif itemsize == 8:
-                self.mc.MOVUPD(resloc, src_addr)
-            else:
-                raise NotImplementedError
 
     def _emit_vec_setitem(self, op, arglocs, regalloc):
         # prepares item scale (raw_store does not)
@@ -83,14 +76,12 @@
 
     def _vec_store(self, baseloc, indexloc, valueloc, integer, itemsize, aligned):
         if integer:
+            raise NotImplementedError
+        else:
             if itemsize == 4:
                 self.mc.stxvw4x(valueloc.value, indexloc.value, baseloc.value)
             elif itemsize == 8:
                 self.mc.stxvd2x(valueloc.value, indexloc.value, baseloc.value)
-            else:
-                raise NotImplementedError
-        else:
-            raise NotImplementedError
 
 
     def emit_vec_int_add(self, op, arglocs, regalloc):
@@ -103,8 +94,41 @@
         elif size == 4:
             raise NotImplementedError
         elif size == 8:
+            raise NotImplementedError # need value in another register!
             self.mc.vaddudm(resloc.value, loc0.value, loc1.value)
 
+    def emit_vec_float_add(self, op, arglocs, resloc):
+        resloc, loc0, loc1, itemsize_loc = arglocs
+        itemsize = itemsize_loc.value
+        if itemsize == 4:
+            self.mc.xvaddsp(resloc.value, loc0.value, loc1.value)
+        elif itemsize == 8:
+            self.mc.xvadddp(resloc.value, loc0.value, loc1.value)
+
+    def emit_vec_float_sub(self, op, arglocs, resloc):
+        resloc, loc0, loc1, itemsize_loc = arglocs
+        itemsize = itemsize_loc.value
+        if itemsize == 4:
+            self.mc.xvsubsp(resloc.value, loc0.value, loc1.value)
+        elif itemsize == 8:
+            self.mc.xvsubdp(resloc.value, loc0.value, loc1.value)
+
+    def emit_vec_float_mul(self, op, arglocs, resloc):
+        resloc, loc0, loc1, itemsize_loc = arglocs
+        itemsize = itemsize_loc.value
+        if itemsize == 4:
+            self.mc.xvmulsp(resloc.value, loc0.value, loc1.value)
+        elif itemsize == 8:
+            self.mc.xvmuldp(resloc.value, loc0.value, loc1.value)
+
+    def emit_vec_float_truediv(self, op, arglocs, resloc):
+        resloc, loc0, loc1, itemsize_loc = arglocs
+        itemsize = itemsize_loc.value
+        if itemsize == 4:
+            self.mc.xvdivsp(resloc.value, loc0.value, loc1.value)
+        elif itemsize == 8:
+            self.mc.xvdivdp(resloc.value, loc0.value, loc1.value)
+
     #def genop_guard_vec_guard_true(self, guard_op, guard_token, locs, resloc):
     #    self.implement_guard(guard_token)
 
@@ -253,23 +277,6 @@
     #def genop_vec_int_xor(self, op, arglocs, resloc):
     #    self.mc.PXOR(resloc, arglocs[0])
 
-    #genop_vec_float_arith = """
-    #def genop_vec_float_{type}(self, op, arglocs, resloc):
-    #    loc0, loc1, itemsize_loc = arglocs
-    #    itemsize = itemsize_loc.value
-    #    if itemsize == 4:
-    #        self.mc.{p_op_s}(loc0, loc1)
-    #    elif itemsize == 8:
-    #        self.mc.{p_op_d}(loc0, loc1)
-    #"""
-    #for op in ['add','mul','sub']:
-    #    OP = op.upper()
-    #    _source = genop_vec_float_arith.format(type=op,
-    #                                           p_op_s=OP+'PS',
-    #                                           p_op_d=OP+'PD')
-    #    exec py.code.Source(_source).compile()
-    #del genop_vec_float_arith
-
     #def genop_vec_float_truediv(self, op, arglocs, resloc):
     #    loc0, loc1, sizeloc = arglocs
     #    size = sizeloc.value
@@ -569,10 +576,10 @@
     prepare_vec_int_add = prepare_vec_arith
     #prepare_vec_int_sub = prepare_vec_arith
     #prepare_vec_int_mul = prepare_vec_arith
-    #prepare_vec_float_add = prepare_vec_arith
-    #prepare_vec_float_sub = prepare_vec_arith
-    #prepare_vec_float_mul = prepare_vec_arith
-    #prepare_vec_float_truediv = prepare_vec_arith
+    prepare_vec_float_add = prepare_vec_arith
+    prepare_vec_float_sub = prepare_vec_arith
+    prepare_vec_float_mul = prepare_vec_arith
+    prepare_vec_float_truediv = prepare_vec_arith
     del prepare_vec_arith
 
     def _prepare_vec_store(self, op):
diff --git a/rpython/jit/metainterp/test/test_vector.py b/rpython/jit/metainterp/test/test_vector.py
--- a/rpython/jit/metainterp/test/test_vector.py
+++ b/rpython/jit/metainterp/test/test_vector.py
@@ -1,5 +1,6 @@
 import py
-
+import pytest
+import math
 from hypothesis import given, note, strategies as st
 from rpython.jit.metainterp.warmspot import ll_meta_interp, get_stats
 from rpython.jit.metainterp.test.support import LLJitMixin
@@ -13,7 +14,8 @@
 from rpython.rlib.rarithmetic import r_uint, intmask
 from rpython.rlib.rawstorage import (alloc_raw_storage, raw_storage_setitem,
                                      free_raw_storage, raw_storage_getitem)
-from rpython.rlib.objectmodel import specialize, is_annotation_constant
+from rpython.rlib.objectmodel import (specialize, is_annotation_constant,
+        always_inline)
 from rpython.jit.backend.detect_cpu import getcpuclass
 
 CPU = getcpuclass()
@@ -24,7 +26,40 @@
 def free(mem):
     lltype.free(mem, flavor='raw')
 
+def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
+    return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
+
+class RawStorage(object):
+    def __init__(self):
+        self.arrays = []
+
+    def new(self, values, type, size=None, zero=True):
+        bytecount = rffi.sizeof(type)
+        if not values:
+            array = alloc_raw_storage(size*bytecount, zero=zero)
+            self.arrays.append(array)
+            return array
+        else:
+            size = len(values)*bytecount
+            array = alloc_raw_storage(size, zero=zero)
+            for i,v in enumerate(values):
+                raw_storage_setitem(array, i*bytecount, rffi.cast(type,v))
+            self.arrays.append(array)
+            return array
+
+    def clear(self):
+        while self.arrays:
+            array = self.arrays.pop()
+            free_raw_storage(array)
+
+@pytest.fixture(scope='session')
+def rawstorage(request):
+    rs = RawStorage()
+    request.addfinalizer(rs.clear)
+    return rs
+
 integers_64bit = st.integers(min_value=-2**63, max_value=2**63-1)
+floats = st.floats()
 
 class VectorizeTests:
     enable_opts = 'intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unroll'
@@ -40,42 +75,80 @@
                               type_system=self.type_system,
                               vec=vec, vec_all=vec_all)
 
-    @given(st.lists(integers_64bit, min_size=5, max_size=50),
-           st.lists(integers_64bit, min_size=5, max_size=50))
-    def test_vector_simple(self, la, lb):
-        myjitdriver = JitDriver(greens = [],
-                                reds = 'auto',
-                                vectorize=True)
-        i = min(len(la), len(lb))
-        la = la[:i]
-        lb = lb[:i]
-        bc = i*rffi.sizeof(rffi.SIGNED)
-        vc = alloc_raw_storage(bc, zero=True)
+    @given(data=st.data())
+    @pytest.mark.parametrize('func', [lambda a,b: a+b,
+        lambda a,b: a*b, lambda a,b: a-b, lambda a,b: a / b])
+    def test_vector_simple_float(self, func, data):
+        func = always_inline(func)
+
+        type = rffi.DOUBLE
+        size = rffi.sizeof(rffi.DOUBLE)
+        myjitdriver = JitDriver(greens = [], reds = 'auto', vectorize=True)
+        def f(bytecount, va, vb, vc):
+            i = 0
+            while i < bytecount:
+                myjitdriver.jit_merge_point()
+                a = raw_storage_getitem(type,va,i)
+                b = raw_storage_getitem(type,vb,i)
+                c = func(a,b)
+                raw_storage_setitem(vc, i, rffi.cast(type,c))
+                i += size
+
+        la = data.draw(st.lists(floats, min_size=10, max_size=150))
+        #la = [0.0,0.0,0.0,0.0,0.0,0.0,0.0]
+        #lb = [0.0,0.0,0.0,0.0,1.7976931348623157e+308,0.0,0.0]
+        l = len(la)
+        lb = data.draw(st.lists(floats, min_size=l, max_size=l))
+
+        rawstorage = RawStorage()
+        va = rawstorage.new(la, type)
+        vb = rawstorage.new(lb, type)
+        vc = rawstorage.new(None, type, size=l)
+        self.meta_interp(f, [l*size, va, vb, vc])
+
+        for i in range(l):
+            c = raw_storage_getitem(type,vc,i*size)
+            r = func(la[i], lb[i])
+            assert isclose(r, c) or (math.isnan(r) and math.isnan(c)) or \
+                   (math.isinf(r) and math.isinf(c) and \
+                    (r < 0.0 and c < 0.0) or \
+                    (r > 0.0 and c > 0.0))
+
+        rawstorage.clear()
+
+    #@given(st.data())
+    def test_vector_simple_int(self):
+
+        type = rffi.SIGNED
         size = rffi.sizeof(rffi.SIGNED)
-        def f(d):
-            va = alloc_raw_storage(bc, zero=True)
-            vb = alloc_raw_storage(bc, zero=True)
-            x = 1
-            for i in range(d):
-                j = i*size
-                raw_storage_setitem(va, j, rffi.cast(rffi.SIGNED,la[i]))
-                raw_storage_setitem(vb, j, rffi.cast(rffi.SIGNED,lb[i]))
+        myjitdriver = JitDriver(greens = [], reds = 'auto', vectorize=True)
+        def f(bytecount, va, vb, vc):
             i = 0
-            while i < bc:
+            while i < bytecount:
                 myjitdriver.jit_merge_point()
-                a = raw_storage_getitem(rffi.SIGNED,va,i)
-                b = raw_storage_getitem(rffi.SIGNED,vb,i)
+                a = raw_storage_getitem(type,va,i)
+                b = raw_storage_getitem(type,vb,i)
                 c = a+b
-                raw_storage_setitem(vc, i, rffi.cast(rffi.SIGNED,c))
-                i += 1*size
+                raw_storage_setitem(vc, i, rffi.cast(type,c))
+                i += size
 
-            free_raw_storage(va)
-            free_raw_storage(vb)
-        self.meta_interp(f, [i])
-        for p in range(i):
-            c = raw_storage_getitem(rffi.SIGNED,vc,p*size)
-            assert intmask(la[p] + lb[p]) == c
-        free_raw_storage(vc)
+        rawstorage = RawStorage()
+        #la = data.draw(st.lists(integers_64bit, min_size=10, max_size=150))
+        la = [0] * 10
+        l = len(la)
+        #lb = data.draw(st.lists(integers_64bit, min_size=l, max_size=l))
+        lb = [0] * 10
+
+        va = rawstorage.new(la, lltype.Signed)
+        vb = rawstorage.new(lb, lltype.Signed)
+        vc = rawstorage.new(None, lltype.Signed, size=l)
+        self.meta_interp(f, [l*size, va, vb, vc])
+
+        for i in range(l):
+            c = raw_storage_getitem(type,vc,i*size)
+            assert intmask(la[i] + lb[i]) == c
+
+        rawstorage.clear()
 
     @py.test.mark.parametrize('i',[1,2,3,8,17,128,130,131,142,143])
     def test_vectorize_array_get_set(self,i):