[pypy-commit] pypy ppc-vsx-support: vectorized reduction test now passes (ppc)

Mon Jul 4 09:30:34 EDT 2016

Author: Richard Plangger <planrichi at gmail.com>
Branch: ppc-vsx-support
Changeset: r85536:1f00adc7b0fd
Date: 2016-07-04 15:29 +0200
http://bitbucket.org/pypy/pypy/changeset/1f00adc7b0fd/

Log:	vectorized reduction test now passes (ppc)

diff --git a/rpython/jit/backend/ppc/codebuilder.py b/rpython/jit/backend/ppc/codebuilder.py
--- a/rpython/jit/backend/ppc/codebuilder.py
+++ b/rpython/jit/backend/ppc/codebuilder.py
@@ -713,6 +713,8 @@
     vperm = VA(4, XO10=43)
     vsel = VA(4, XO10=42) 
     vspltisb = VXI(4, XO8=780)
+    vspltish = VXI(4, XO8=844)
+    vspltisw = VXI(4, XO8=908)
 
     VX_splat = Form("ivrT", "ivrB", "ivrA", "XO8")
     vspltb = VX_splat(4, XO8=524)
diff --git a/rpython/jit/backend/ppc/condition.py b/rpython/jit/backend/ppc/condition.py
--- a/rpython/jit/backend/ppc/condition.py
+++ b/rpython/jit/backend/ppc/condition.py
@@ -6,6 +6,10 @@
 GE = 5
 SO = 6
 NS = 7
+VEQ = 8
+VEQI = 9
+VNE = 10
+VNEI = 11
 cond_none = -1    # invalid
 
 def negate(cond):
@@ -19,6 +23,8 @@
 assert negate(GE) == LT
 assert negate(SO) == NS
 assert negate(NS) == SO
+assert negate(VEQ) == VEQI
+assert negate(VNE) == VNEI
 
 encoding = [
     (2, 12),   # EQ
@@ -29,4 +35,8 @@
     (0, 4),    # GE
     (3, 12),   # SO
     (3, 4),    # NS
+    (24, 12),   # VEQ
+    (24, 4),    # VEQI
+    (26, 12),    # VNE
+    (26, 4),    # VNEI
 ]
diff --git a/rpython/jit/backend/ppc/ppc_assembler.py b/rpython/jit/backend/ppc/ppc_assembler.py
--- a/rpython/jit/backend/ppc/ppc_assembler.py
+++ b/rpython/jit/backend/ppc/ppc_assembler.py
@@ -1369,7 +1369,6 @@
             allocation. This needs remapping which is done here for both normal registers
             and accumulation registers.
         """
-        import pdb; pdb.set_trace()
         asminfo, bridge_faildescr, version, looptoken = target
         assert isinstance(bridge_faildescr, ResumeGuardDescr)
         assert isinstance(faildescr, ResumeGuardDescr)
@@ -1385,7 +1384,6 @@
         # if accumulation is saved at the guard, we need to update it here!
         guard_locs = self.rebuild_faillocs_from_descr(faildescr, version.inputargs)
         bridge_locs = self.rebuild_faillocs_from_descr(bridge_faildescr, version.inputargs)
-        #import pdb; pdb.set_trace()
         guard_accum_info = faildescr.rd_vector_info
         # O(n**2), but usually you only have at most 1 fail argument
         while guard_accum_info:
diff --git a/rpython/jit/backend/ppc/vector_ext.py b/rpython/jit/backend/ppc/vector_ext.py
--- a/rpython/jit/backend/ppc/vector_ext.py
+++ b/rpython/jit/backend/ppc/vector_ext.py
@@ -34,7 +34,18 @@
     # "propagate it between this operation and the next guard by keeping
     # it in the cc".  In the uncommon case, result_loc is another
     # register, and we emit a load from the cc into this register.
+
+    # Possibly invert the bit in the CR
+    bit, invert = c.encoding[condition]
+    assert 24 <= bit <= 27
+    if invert == 12:
+        pass
+    elif invert == 4:
+        asm.mc.crnor(bit, bit, bit)
+    else:
+        assert 0
     assert asm.guard_success_cc == c.cond_none
+    #
     if result_loc is r.SPP:
         asm.guard_success_cc = condition
     else:
@@ -386,6 +397,7 @@
             self.mc.vcmpgtuwx(resloc.value, argloc.value, tmp)
         elif size == 8:
             self.mc.vcmpgtudx(resloc.value, argloc.value, tmp)
+        flush_vec_cc(self, regalloc, c.VNEI, op.bytesize, resloc)
 
     def emit_vec_float_eq(self, op, arglocs, regalloc):
         resloc, loc1, loc2, sizeloc = arglocs
@@ -404,7 +416,7 @@
         else:
             notimplemented("[ppc/assembler] float == for size %d" % size)
         self.mc.lvx(resloc.value, off, r.SP.value)
-        flush_vec_cc(self, regalloc, c.EQ, op.bytesize, resloc)
+        flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, resloc)
 
     def emit_vec_float_xor(self, op, arglocs, regalloc):
         resloc, l0, l1, sizeloc = arglocs
@@ -432,7 +444,7 @@
         res = resloc.value
         self.mc.lvx(res, off, r.SP.value)
         self.mc.vnor(res, res, res) # complement
-        flush_vec_cc(self, regalloc, c.NE, op.bytesize, resloc)
+        flush_vec_cc(self, regalloc, c.VNEI, op.bytesize, resloc)
 
     def emit_vec_cast_int_to_float(self, op, arglocs, regalloc):
         res, l0 = arglocs
@@ -455,7 +467,7 @@
             self.mc.vcmpequwx(res.value, l0.value, l1.value)
         elif size == 8:
             self.mc.vcmpequdx(res.value, l0.value, l1.value)
-        flush_vec_cc(self, regalloc, c.EQ, op.bytesize, res)
+        flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, res)
 
     def emit_vec_int_ne(self, op, arglocs, regalloc):
         res, l0, l1, sizeloc = arglocs
@@ -471,7 +483,7 @@
         elif size == 8:
             self.mc.vcmpequdx(res.value, res.value, tmp)
         self.mc.vnor(res.value, res.value, res.value)
-        flush_vec_cc(self, regalloc, c.NE, op.bytesize, res)
+        flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, res)
 
     def emit_vec_expand_f(self, op, arglocs, regalloc):
         resloc, srcloc = arglocs
@@ -549,7 +561,7 @@
         if size == 8:
             if srcloc.is_vector_reg(): # reg <- vector
                 assert not resloc.is_vector_reg()
-                self.mc.load_imm(r.SCRATCH, PARAM_SAVE_AREA_OFFSET)
+                self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET)
                 self.mc.stvx(src, r.SCRATCH2.value, r.SP.value)
                 self.mc.load(res, r.SP.value, PARAM_SAVE_AREA_OFFSET+8*idx)
             else:
@@ -621,6 +633,17 @@
         else:
             return self.ivrm.force_allocate_reg(op, forbidden_vars)
 
+    def force_allocate_vector_reg_or_cc(self, op):
+        assert op.type == INT
+        if self.next_op_can_accept_cc(self.operations, self.rm.position):
+            # hack: return the SPP location to mean "lives in CC".  This
+            # SPP will not actually be used, and the location will be freed
+            # after the next op as usual.
+            self.rm.force_allocate_frame_reg(op)
+            return r.SPP
+        else:
+            return self.force_allocate_vector_reg(op)
+
     def ensure_vector_reg(self, box):
         if box.type == FLOAT:
             return self.vrm.make_sure_var_in_reg(box,
@@ -691,14 +714,25 @@
     prepare_vec_int_and = prepare_vec_arith
     prepare_vec_int_or = prepare_vec_arith
     prepare_vec_int_xor = prepare_vec_arith
-
-    prepare_vec_float_eq = prepare_vec_arith
-    prepare_vec_float_ne = prepare_vec_arith
-    prepare_vec_int_eq = prepare_vec_arith
-    prepare_vec_int_ne = prepare_vec_arith
     prepare_vec_float_xor = prepare_vec_arith
     del prepare_vec_arith
 
+    def prepare_vec_bool(self, op):
+        a0 = op.getarg(0)
+        a1 = op.getarg(1)
+        assert isinstance(op, VectorOp)
+        size = op.bytesize
+        args = op.getarglist()
+        loc0 = self.ensure_vector_reg(a0)
+        loc1 = self.ensure_vector_reg(a1)
+        resloc = self.force_allocate_vector_reg_or_cc(op)
+        return [resloc, loc0, loc1, imm(size)]
+
+    prepare_vec_float_eq = prepare_vec_bool
+    prepare_vec_float_ne = prepare_vec_bool
+    prepare_vec_int_eq = prepare_vec_bool
+    prepare_vec_int_ne = prepare_vec_bool
+    del prepare_vec_bool
 
     def prepare_vec_store(self, op):
         descr = op.getdescr()
@@ -826,7 +860,7 @@
         arg = op.getarg(0)
         assert isinstance(arg, VectorOp)
         argloc = self.ensure_vector_reg(arg)
-        resloc = self.force_allocate_vector_reg(op)
+        resloc = self.force_allocate_vector_reg_or_cc(op)
         return [resloc, argloc, imm(arg.bytesize)]
 
     def _prepare_vec(self, op):
@@ -843,19 +877,11 @@
 
     prepare_vec_cast_int_to_float = prepare_vec_cast_float_to_int
 
-    def load_vector_condition_into_cc(self, box):
-        if self.assembler.guard_success_cc == c.cond_none:
-            # compare happended before
-            #loc = self.ensure_reg(box)
-            #mc = self.assembler.mc
-            #mc.cmp_op(0, loc.value, 0, imm=True)
-            self.assembler.guard_success_cc = c.NE
-
     def prepare_vec_guard_true(self, op):
-        self.load_vector_condition_into_cc(op.getarg(0))
+        self.assembler.guard_success_cc = c.VEQ
         return self._prepare_guard(op)
 
     def prepare_vec_guard_false(self, op):
-        self.load_vector_condition_into_cc(op.getarg(0))
+        self.assembler.guard_success_cc = c.VNE
         return self._prepare_guard(op)
 
diff --git a/rpython/jit/metainterp/test/test_vector.py b/rpython/jit/metainterp/test/test_vector.py
--- a/rpython/jit/metainterp/test/test_vector.py
+++ b/rpython/jit/metainterp/test/test_vector.py
@@ -332,29 +332,29 @@
 
     @py.test.mark.parametrize('type,func,init,insert,at,count,breaks',
             # all
-           [#(rffi.DOUBLE, lambda x: not bool(x), 1.0, None, -1,32, False),
-            #(rffi.DOUBLE, lambda x: x == 0.0,    1.0, None, -1,33, False),
-            #(rffi.DOUBLE, lambda x: x == 0.0,    1.0, 0.0,  33,34, True),
-            #(rffi.DOUBLE, lambda x: x == 0.0,    1.0, 0.1,  4,34, False),
-            #(lltype.Signed, lambda x: not bool(x), 1, None, -1,32, False),
+           [(rffi.DOUBLE, lambda x: not bool(x), 1.0, None, -1,32, False),
+            (rffi.DOUBLE, lambda x: x == 0.0,    1.0, None, -1,33, False),
+            (rffi.DOUBLE, lambda x: x == 0.0,    1.0, 0.0,  33,34, True),
+            (rffi.DOUBLE, lambda x: x == 0.0,    1.0, 0.1,  4,34, False),
+            (lltype.Signed, lambda x: not bool(x), 1, None, -1,32, False),
             (lltype.Signed, lambda x: not bool(x), 1, 0,    14,32, True),
-            #(lltype.Signed, lambda x: not bool(x), 1, 0,    15,31, True),
-            #(lltype.Signed, lambda x: not bool(x), 1, 0,    4,30, True),
-            #(lltype.Signed, lambda x: x == 0,      1, None, -1,33, False),
-            #(lltype.Signed, lambda x: x == 0,      1, 0,  33,34, True),
-            ## any
-            #(rffi.DOUBLE, lambda x: x != 0.0,    0.0, 1.0,  33,35, True),
-            #(rffi.DOUBLE, lambda x: x != 0.0,    0.0, 1.0,  -1,36, False),
-            #(rffi.DOUBLE, lambda x: bool(x),     0.0, 1.0,  33,37, True),
-            #(rffi.DOUBLE, lambda x: bool(x),     0.0, 1.0,  -1,38, False),
-            #(lltype.Signed, lambda x: x != 0,    0, 1,  33,35, True),
-            #(lltype.Signed, lambda x: x != 0,    0, 1,  -1,36, False),
-            #(lltype.Signed, lambda x: bool(x),   0, 1,  33,37, True),
-            #(lltype.Signed, lambda x: bool(x),   0, 1,  -1,38, False),
-            #(rffi.INT, lambda x: intmask(x) != 0,    rffi.r_int(0), rffi.r_int(1),  33,35, True),
-            #(rffi.INT, lambda x: intmask(x) != 0,    rffi.r_int(0), rffi.r_int(1),  -1,36, False),
-            #(rffi.INT, lambda x: bool(intmask(x)),   rffi.r_int(0), rffi.r_int(1),  33,37, True),
-            #(rffi.INT, lambda x: bool(intmask(x)),   rffi.r_int(0), rffi.r_int(1),  -1,38, False),
+            (lltype.Signed, lambda x: not bool(x), 1, 0,    15,31, True),
+            (lltype.Signed, lambda x: not bool(x), 1, 0,    4,30, True),
+            (lltype.Signed, lambda x: x == 0,      1, None, -1,33, False),
+            (lltype.Signed, lambda x: x == 0,      1, 0,  33,34, True),
+            # any
+            (rffi.DOUBLE, lambda x: x != 0.0,    0.0, 1.0,  33,35, True),
+            (rffi.DOUBLE, lambda x: x != 0.0,    0.0, 1.0,  -1,36, False),
+            (rffi.DOUBLE, lambda x: bool(x),     0.0, 1.0,  33,37, True),
+            (rffi.DOUBLE, lambda x: bool(x),     0.0, 1.0,  -1,38, False),
+            (lltype.Signed, lambda x: x != 0,    0, 1,  33,35, True),
+            (lltype.Signed, lambda x: x != 0,    0, 1,  -1,36, False),
+            (lltype.Signed, lambda x: bool(x),   0, 1,  33,37, True),
+            (lltype.Signed, lambda x: bool(x),   0, 1,  -1,38, False),
+            (rffi.INT, lambda x: intmask(x) != 0,    rffi.r_int(0), rffi.r_int(1),  33,35, True),
+            (rffi.INT, lambda x: intmask(x) != 0,    rffi.r_int(0), rffi.r_int(1),  -1,36, False),
+            (rffi.INT, lambda x: bool(intmask(x)),   rffi.r_int(0), rffi.r_int(1),  33,37, True),
+            (rffi.INT, lambda x: bool(intmask(x)),   rffi.r_int(0), rffi.r_int(1),  -1,38, False),
            ])
     def test_bool_reduction(self, type, func, init, insert, at, count, breaks):
         myjitdriver = JitDriver(greens = [], reds = 'auto', vectorize=True)