[pypy-commit] pypy vecopt: resolved an issue that would generate wrong packing immediates for shufps.

plan_rich noreply at buildbot.pypy.org
Fri May 15 09:39:19 CEST 2015


Author: Richard Plangger <rich at pasra.at>
Branch: vecopt
Changeset: r77328:2a8cae0c7c8e
Date: 2015-05-15 09:39 +0200
http://bitbucket.org/pypy/pypy/changeset/2a8cae0c7c8e/

Log:	Resolved an issue that would generate wrong packing immediates for
	shufps. A better approach in the assembler is needed to handle these
	pack/unpack instructions.

diff --git a/pypy/module/micronumpy/test/test_zjit.py b/pypy/module/micronumpy/test/test_zjit.py
--- a/pypy/module/micronumpy/test/test_zjit.py
+++ b/pypy/module/micronumpy/test/test_zjit.py
@@ -6,10 +6,16 @@
 from rpython.jit.metainterp.test.support import LLJitMixin
 from rpython.jit.backend.x86.test.test_basic import Jit386Mixin
 from rpython.jit.metainterp.warmspot import reset_jit, get_stats
+from rpython.jit.metainterp.jitprof import Profiler
+from rpython.rlib.jit import Counters
 from pypy.module.micronumpy import boxes
 from pypy.module.micronumpy.compile import FakeSpace, Parser, InterpreterState
 from pypy.module.micronumpy.base import W_NDimArray
 
+def get_profiler():
+    from rpython.jit.metainterp import pyjitpl
+    return pyjitpl._warmrunnerdesc.metainterp_sd.profiler
+
 class TestNumpyJit(Jit386Mixin):
     graph = None
     interp = None
@@ -79,12 +85,23 @@
                                              listcomp=True,
                                              backendopt=True,
                                              graph_and_interp_only=True,
+                                             ProfilerClass=Profiler,
                                              vectorize=True)
             self.__class__.interp = interp
             self.__class__.graph = graph
 
+    def check_vectorized(self, expected_tried, expected_success):
+        profiler = get_profiler()
+        tried = profiler.get_counter(Counters.OPT_VECTORIZE_TRY)
+        success = profiler.get_counter(Counters.OPT_VECTORIZED)
+        assert tried >= success
+        assert tried == expected_tried
+        assert success == expected_success
+
     def run(self, name):
         self.compile_graph()
+        profiler = get_profiler()
+        profiler.start()
         reset_jit()
         i = self.code_mapping[name]
         retval = self.interp.eval_graph(self.graph, [i])
@@ -92,23 +109,25 @@
 
     def define_float32_add():
         return """
-        a = |30|
+        a = astype(|30|, float32)
         b = a + a
         b -> 15
         """
     def test_float32_add(self):
         result = self.run("float32_add")
         self.assert_float_equal(result, 15.0 + 15.0)
+        self.check_vectorized(2, 2)
 
     def define_float_add():
         return """
-        a = astype(|30|, float32)
+        a = |30|
         b = a + a
-        b -> 17
+        b -> 15
         """
     def test_float_add(self):
         result = self.run("float_add")
         self.assert_float_equal(result, 17.0 + 17.0)
+        self.check_vectorized(1, 1)
 
     def define_float32_add_const():
         return """
@@ -119,6 +138,7 @@
     def test_float32_add_const(self):
         result = self.run("float32_add_const")
         self.assert_float_equal(result, 29.0 + 77.345)
+        self.check_vectorized(2, 2)
 
     def define_float_add_const():
         return """
@@ -128,6 +148,7 @@
     def test_float_add_const(self):
         result = self.run("float_add_const")
         self.assert_float_equal(result, 29.0 + 25.5)
+        self.check_vectorized(1, 1)
 
     def define_pow():
         return """
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -7,7 +7,7 @@
                                                 DEBUG_COUNTER, debug_bridge)
 from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
 from rpython.jit.backend.llsupport.gcmap import allocate_gcmap
-from rpython.jit.metainterp.history import Const, Box, VOID, BoxVector
+from rpython.jit.metainterp.history import Const, Box, VOID, BoxVector, ConstInt
 from rpython.jit.metainterp.history import AbstractFailDescr, INT, REF, FLOAT
 from rpython.rtyper.lltypesystem import lltype, rffi, rstr, llmemory
 from rpython.rtyper.lltypesystem.lloperation import llop
@@ -2576,30 +2576,36 @@
             return src_loc
         select = 0
         if item_type == FLOAT:
-            self.mc.MOVSS(tmp_loc, src_loc)
-            i = 0
-            while i < count:
-                select |= (index+i<<(i*2))
-                i += 1
-            self.mc.SHUFPS_xxi(tmp_loc.value, tmp_loc.value, select)
-            return tmp_loc
+            if size == 4:
+                self.mc.MOVUPS(tmp_loc, src_loc) # TODO could be aligned if xx
+                i = 0
+                while i < count:
+                    select |= (index+i<<(i*2))
+                    i += 1
+                self.mc.SHUFPS_xxi(tmp_loc.value, tmp_loc.value, select)
+                return tmp_loc
+            else:
+                py.test.set_trace()
+                raise NotImplementedError("shuffle by index for float64 not impl")
         else:
             py.test.set_trace()
             raise NotImplementedError("shuffle by index for non floats")
 
 
     def genop_vec_box_pack(self, op, arglocs, resloc):
-        toloc, fromloc, indexloc, sizeloc = arglocs
-        toarg = op.getarg(0)
-        index = indexloc.value
-        size = sizeloc.value
+        toloc, fromloc, tmploc = arglocs
+        result = op.result
+        indexarg = op.getarg(2)
+        assert isinstance(result, BoxVector)
+        assert isinstance(indexarg, ConstInt)
+        index = indexarg.value
+        size = result.item_size
+        #py.test.set_trace()
         if size == 4:
-            select = 0
+            select = (1 << 2) # move 0 -> 0, 1 -> 1 for toloc
+            # TODO
             if index == 2:
-                select |= (1<<0)
-                select |= (2<<2)
-                select |= (3<<4)
-                select |= (4<<6)
+                select |= (1<<6) # move 0 -> 2, 1 -> 3 for fromloc
             else:
                 raise NotImplementedError("index is not equal to 2")
 
@@ -2621,7 +2627,7 @@
             self.mc.CVTPS2PD(resloc, loc0)
         else:
             assert index == 2
-            self.mc.MOVSS_xx(tmploc.value, loc0.value)
+            self.mc.MOVUPS(tmploc, loc0) # TODO could be aligned if xx
             select = (2<<0)|(3<<2) # move pos 2->0,3->1
             self.mc.SHUFPS_xxi(tmploc.value, tmploc.value, select)
             self.mc.CVTPS2PD(resloc, tmploc) # expand
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -1477,7 +1477,7 @@
         assert not descr.is_array_of_pointers() and \
                not descr.is_array_of_structs()
         itemsize, ofs, _ = unpack_arraydescr(descr)
-        integer = not descr.is_array_of_floats()
+        integer = not (descr.is_array_of_floats() or descr.concrete_type == FLOAT)
         aligned = False
         args = op.getarglist()
         base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args)
@@ -1498,7 +1498,7 @@
         value_loc = self.make_sure_var_in_reg(op.getarg(2), args)
         ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args)
 
-        integer = not descr.is_array_of_floats()
+        integer = not (descr.is_array_of_floats() or descr.concrete_type == FLOAT)
         aligned = False
         self.perform_discard(op, [base_loc, ofs_loc, value_loc,
                                  imm(itemsize), imm(ofs), imm(integer), imm(aligned)])
@@ -1536,15 +1536,13 @@
     del consider_vec_logic
 
     def consider_vec_box_pack(self, op):
-        count = op.getarg(3)
-        index = op.getarg(2)
-        assert isinstance(count, ConstInt)
-        assert isinstance(index, ConstInt)
-        itemsize = self.assembler.cpu.vector_register_size // count.value
         args = op.getarglist()
-        loc0 = self.make_sure_var_in_reg(op.getarg(0), args)
         loc1 = self.make_sure_var_in_reg(op.getarg(1), args)
-        self.perform(op, [loc0, loc1, imm(index.value), imm(itemsize)], None)
+        result =  self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+        tmpxvar = TempBox()
+        tmploc = self.xrm.force_allocate_reg(tmpxvar)
+        self.xrm.possibly_free_var(tmpxvar)
+        self.perform(op, [result, loc1, tmploc], result)
 
     def consider_vec_box_unpack(self, op):
         count = op.getarg(2)
diff --git a/rpython/jit/metainterp/history.py b/rpython/jit/metainterp/history.py
--- a/rpython/jit/metainterp/history.py
+++ b/rpython/jit/metainterp/history.py
@@ -563,7 +563,7 @@
         raise NotImplementedError("cannot forget value of vector")
 
     def clonebox(self):
-        return BoxVector(self.item_type, self.item_count)
+        return BoxVector(self.item_type, self.item_count, self.item_size, self.signed)
 
     def constbox(self):
         raise NotImplementedError("not possible to have a constant vector box")
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
@@ -1192,8 +1192,8 @@
         v224 = vec_float_add(v219, v222, 2)
         v225 = vec_cast_float_to_singlefloat(v223, 2)
         v226 = vec_cast_float_to_singlefloat(v224, 2)
-        vec_box_pack(v225, v226, 2, 2)
-        vec_raw_store(p2, i4, v225, 4, descr=singlefloatarraydescr)
+        v227 = vec_box_pack(v225, v226, 2, 2)
+        vec_raw_store(p2, i4, v227, 4, descr=singlefloatarraydescr)
         jump(p0, p1, p2, i210, i189)
         """
         vopt = self.vectorize(self.parse_loop(ops))
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -13,6 +13,7 @@
 from rpython.jit.metainterp.resoperation import (rop, ResOperation, GuardResOp)
 from rpython.rlib.objectmodel import we_are_translated
 from rpython.rlib.debug import debug_print, debug_start, debug_stop
+from rpython.rlib.jit import Counters
 from rpython.rtyper.lltypesystem import lltype, rffi
 
 class NotAVectorizeableLoop(JitException):
@@ -42,10 +43,10 @@
                     inline_short_preamble, start_state, False)
     orig_ops = loop.operations
     try:
-        jitdriver_sd.profiler.count(Counters.OPT_VECTORIZE_TRY)
+        metainterp_sd.profiler.count(Counters.OPT_VECTORIZE_TRY)
         opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, loop, optimizations)
         opt.propagate_all_forward()
-        jitdriver_sd.profiler.count(Counters.OPT_VECTORIZED)
+        metainterp_sd.profiler.count(Counters.OPT_VECTORIZED)
     except NotAVectorizeableLoop:
         # vectorization is not possible, propagate only normal optimizations
         loop.operations = orig_ops
@@ -690,8 +691,6 @@
                 else:
                     # vbox of a variable/constant is not present here
                     pass
-        if not we_are_translated():
-            assert ptype.is_valid()
         self.pack.ptype = ptype
 
     def vector_result(self, vop, packargs):
@@ -731,6 +730,7 @@
         if packed < packable:
             args = [op.getoperation().getarg(argidx) for op in ops]
             self.package(vbox, packed, args, packable)
+            _, vbox = self.box_to_vbox.get(vop.getarg(argidx), (-1, None))
         vop.setarg(argidx, vbox)
         return vbox
 
@@ -749,13 +749,40 @@
             if pos == -1:
                 i += 1
                 continue
+            new_box = tgt_box.clonebox()
+            new_box.item_count += src_box.item_count
             op = ResOperation(rop.VEC_BOX_PACK,
                               [tgt_box, src_box, ConstInt(i),
-                               ConstInt(src_box.item_count)], None)
+                               ConstInt(src_box.item_count)], new_box)
             self.preamble_ops.append(op)
-            tgt_box.item_count += src_box.item_count
+            self._check_vec_pack(op)
             i += src_box.item_count
 
+            # overwrite the new positions, arguments now live in new_box
+            # at a new position
+            for j in range(i):
+                arg = args[j]
+                self.box_to_vbox[arg] = (j, new_box)
+
+    def _check_vec_pack(self, op):
+        result = op.result
+        arg0 = op.getarg(0)
+        arg1 = op.getarg(1)
+        index = op.getarg(2)
+        count = op.getarg(3)
+        assert isinstance(result, BoxVector)
+        assert isinstance(arg0, BoxVector)
+        assert isinstance(index, ConstInt)
+        assert isinstance(count, ConstInt)
+        assert arg0.item_size == result.item_size
+        if isinstance(arg1, BoxVector):
+            assert arg1.item_size == result.item_size
+        else:
+            assert count.value == 1
+        assert index.value < result.item_size
+        assert index.value + count.value <= result.item_size
+        assert result.item_count > arg0.item_count
+
     def expand_box_to_vector_box(self, vop, argidx):
         arg = vop.getarg(argidx)
         all_same_box = True


More information about the pypy-commit mailing list