[pypy-commit] pypy zarch-simd-support: some more pack/unpack cases implemented

plan_rich pypy.commits at gmail.com
Fri Sep 16 08:42:05 EDT 2016


Author: Richard Plangger <planrichi at gmail.com>
Branch: zarch-simd-support
Changeset: r87131:2cd9f79ff7de
Date: 2016-09-16 11:59 +0200
http://bitbucket.org/pypy/pypy/changeset/2cd9f79ff7de/

Log:	some more pack/unpack cases implemented: add VRS-b/VRS-c encoders
	for the new VLVG/VLGV vector instructions, use them in
	emit_vec_pack_i/emit_vec_unpack_i, and rework the float path of
	_accum_reduce

diff --git a/rpython/jit/backend/zarch/instruction_builder.py b/rpython/jit/backend/zarch/instruction_builder.py
--- a/rpython/jit/backend/zarch/instruction_builder.py
+++ b/rpython/jit/backend/zarch/instruction_builder.py
@@ -546,6 +546,30 @@
         self.writechar(opcode2)
     return encode_vri_a
 
+def build_vrs_b(mnemonic, (opcode1,opcode2), argtypes='v,r,db,m'):
+    @builder.arguments(argtypes)
+    def encode_vrs_b(self, v1, r2, db3, m4):
+        self.writechar(opcode1)
+        rbx = (v1 >= 16) << 3
+        byte = (v1 & BIT_MASK_4) << 4 | (r2 & BIT_MASK_4)
+        self.writechar(chr(byte))
+        encode_base_displace(self, db3)
+        self.writechar(chr((m4 & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4)))
+        self.writechar(opcode2)
+    return encode_vrs_b
+
+def build_vrs_c(mnemonic, (opcode1,opcode2), argtypes='r,v,db,m'):
+    @builder.arguments(argtypes)
+    def encode_vrs_c(self, r1, v2, db3, m4):
+        self.writechar(opcode1)
+        rbx = (v2 >= 16) << 2
+        byte = (r1 & BIT_MASK_4) << 4 | (v2 & BIT_MASK_4)
+        self.writechar(chr(byte))
+        encode_base_displace(self, db3)
+        self.writechar(chr((m4 & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4)))
+        self.writechar(opcode2)
+    return encode_vrs_c
+
 
 def build_unpack_func(mnemonic, func):
     @always_inline
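
As a sanity check on the new VRS-b encoder, here is a standalone sketch (not
backend code) of the six-byte layout that encode_vrs_b produces, assuming
BIT_MASK_4 is 0xf and that encode_base_displace() writes the base-register
nibble followed by the 12-bit displacement; the helper name sketch_vrs_b and
the example operands are made up for illustration:

def sketch_vrs_b(opcode1, opcode2, v1, r3, base, disp, m4):
    # mirrors encode_vrs_b above: opcode1, V1|R3, B2|D2, M4|RXB, opcode2
    rxb = (v1 >= 16) << 3                          # RXB bit 0 extends V1 to v16..v31
    return ''.join(chr(c) for c in [
        ord(opcode1),
        ((v1 & 0xf) << 4) | (r3 & 0xf),            # V1 | R3
        ((base & 0xf) << 4) | ((disp >> 8) & 0xf), # B2 | high nibble of D2
        disp & 0xff,                               # low byte of the 12-bit D2
        ((m4 & 0xf) << 4) | rxb,                   # M4 | RXB
        ord(opcode2),
    ])

# VLVG %v17, %r3, 8(%r15) with element-size mask 3 (doubleword)
assert sketch_vrs_b('\xE7', '\x22', 17, 3, 15, 8, 3) == '\xe7\x13\xf0\x08\x38\x22'
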
diff --git a/rpython/jit/backend/zarch/instructions.py b/rpython/jit/backend/zarch/instructions.py
--- a/rpython/jit/backend/zarch/instructions.py
+++ b/rpython/jit/backend/zarch/instructions.py
@@ -334,6 +334,8 @@
     'VMRL':  ('vrr_c', ['\xE7','\x60'], 'v,v,v,m'),
     'VMRH':  ('vrr_c', ['\xE7','\x61'], 'v,v,v,m'),
     'VPDI':  ('vrr_c', ['\xE7','\x84'], 'v,v,v,m'),
+    'VLVG':  ('vrs_b', ['\xE7','\x22']),
+    'VLGV':  ('vrs_c', ['\xE7','\x21']),
 
     # '': ('', ['','']),
 }
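
The two new table entries hook up VLVG (load a vector element from a general
register) and VLGV (load a general register from a vector element). A
hypothetical pure-Python model of their semantics, only to show why they map
directly onto the pack/unpack rewrite below; the list-of-lanes representation
and the names vlvg/vlgv are illustration only:

# model only: a vector is a list of 16 // size integer lanes
def vlvg(vec, gpr, index, size):
    vec = list(vec)
    vec[index] = gpr & ((1 << (8 * size)) - 1)    # insert the low 'size' bytes
    return vec

def vlgv(vec, index, size):
    return vec[index] & ((1 << (8 * size)) - 1)   # extract, zero-extended

v = vlvg([0, 0], 42, 1, 8)      # doubleword insert at lane 1
assert vlgv(v, 1, 8) == 42
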
diff --git a/rpython/jit/backend/zarch/vector_ext.py b/rpython/jit/backend/zarch/vector_ext.py
--- a/rpython/jit/backend/zarch/vector_ext.py
+++ b/rpython/jit/backend/zarch/vector_ext.py
@@ -263,24 +263,6 @@
         # 4 => bit 1 from the MSB: XxC
         self.mc.VCGD(resloc, loc0, 3, 4, mask.RND_TOZERO.value)
 
-    def emit_vec_expand_f(self, op, arglocs, regalloc):
-        assert isinstance(op, VectorOp)
-        resloc, srcloc = arglocs
-        size = op.bytesize
-        res = resloc.value
-        if isinstance(srcloc, l.ConstFloatLoc):
-            # they are aligned!
-            assert size == 8
-            tloc = regalloc.rm.get_scratch_reg()
-            self.mc.load_imm(tloc, srcloc.value)
-            self.mc.lxvd2x(res, 0, tloc.value)
-        elif size == 8:
-            # splat the low of src to both slots in res
-            src = srcloc.value
-            self.mc.xxspltdl(res, src, src)
-        else:
-            not_implemented("vec expand in this combination not supported")
-
     def emit_vec_expand_i(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
         resloc, loc0 = arglocs
@@ -292,13 +274,12 @@
     def _accum_reduce(self, op, arg, accumloc, targetloc):
         # Currently the accumulator can ONLY be 64 bit float/int
         if arg.type == FLOAT:
-            # r = (r[0]+r[1],r[0]+r[1])
-            self.mc.VMRL(targetloc, accumloc, accumloc, l.MASK_VEC_DWORD)
+            self.mc.VPDI(targetloc, accumloc, accumloc, permi(1,0))
             if op == '+':
-                self.mc.VFA(targetloc, targetloc, accumloc, 3, 0, 0)
+                self.mc.VFA(targetloc, targetloc, accumloc, 3, 0b1000, 0)
                 return
             elif op == '*':
-                self.mc.VFM(targetloc, targetloc, accumloc, 3, 0, 0)
+                self.mc.VFM(targetloc, targetloc, accumloc, 3, 0b1000, 0)
                 return
         else:
             assert arg.type == INT
@@ -317,41 +298,14 @@
 
     def emit_vec_pack_i(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
-        resultloc, vloc, sourceloc, residxloc, srcidxloc, countloc = arglocs
-        srcidx = srcidxloc.value
+        resultloc, vloc, sourceloc, residxloc, srcidxloc, countloc = arglocs
         residx = residxloc.value
         count = countloc.value
-        res = resultloc.value
-        vector = vloc.value
-        src = sourceloc.value
         size = op.bytesize
         assert resultloc.is_vector_reg() # vector <- reg
-        self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET)
-        self.mc.stvx(vector, r.SCRATCH2.value, r.SP.value)
-        idx = residx
-        if size == 8:
-            if not IS_BIG_ENDIAN:
-                idx = (16 // size) - 1 - idx
-            self.mc.store(src, r.SP.value, PARAM_SAVE_AREA_OFFSET+8*idx)
-        elif size == 4:
-            for j in range(count):
-                idx = j + residx
-                if not IS_BIG_ENDIAN:
-                    idx = (16 // size) - 1 - idx
-                self.mc.stw(src, r.SP.value, PARAM_SAVE_AREA_OFFSET+4*idx)
-        elif size == 2:
-            for j in range(count):
-                idx = j + residx
-                if not IS_BIG_ENDIAN:
-                    idx = (16 // size) - 1 - idx
-                self.mc.sth(src, r.SP.value, PARAM_SAVE_AREA_OFFSET+2*idx)
-        elif size == 1:
-            for j in range(count):
-                idx = j + residx
-                if not IS_BIG_ENDIAN:
-                    idx = (16 // size) - 1 - idx
-                self.mc.stb(src, r.SP.value, PARAM_SAVE_AREA_OFFSET+idx)
-        self.mc.lvx(res, r.SCRATCH2.value, r.SP.value)
+        for j in range(count):
+            index = l.addr(j + residx)
+            self.mc.VLVG(resultloc, sourceloc, index, l.itemsize_to_mask(size))
 
     def emit_vec_unpack_i(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
@@ -364,44 +318,26 @@
         if count == 1:
             assert srcloc.is_vector_reg()
             assert not resloc.is_vector_reg()
-            off = PARAM_SAVE_AREA_OFFSET
-            self.mc.load_imm(r.SCRATCH2, off)
-            self.mc.stvx(src, r.SCRATCH2.value, r.SP.value)
-            if not IS_BIG_ENDIAN:
-                idx = (16 // size) - 1 - idx
-            off += size * idx
-            if size == 8:
-                self.mc.load(res, r.SP.value, off)
-                return
-            elif size == 4:
-                self.mc.lwa(res, r.SP.value, off)
-                return
-            elif size == 2:
-                self.mc.lha(res, r.SP.value, off)
-                return
-            elif size == 1:
-                self.mc.lbz(res, r.SP.value, off)
-                self.mc.extsb(res, res)
-                return
+            self.mc.VLGV(resloc, srcloc, l.addr(idx), l.itemsize_to_mask(size))
         else:
             # count is not 1, but only 2 is supported for i32
             # 4 for i16 and 8 for i8.
             src = srcloc.value
             res = resloc.value
 
-            self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET)
-            self.mc.stvx(src, r.SCRATCH2.value, r.SP.value)
-            self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET+16)
-            self.mc.stvx(res, r.SCRATCH2.value, r.SP.value)
+            #self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET)
+            #self.mc.stvx(src, r.SCRATCH2.value, r.SP.value)
+            #self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET+16)
+            #self.mc.stvx(res, r.SCRATCH2.value, r.SP.value)
             if count * size == 8:
                 if not IS_BIG_ENDIAN:
                     endian_off = 8
-                off = PARAM_SAVE_AREA_OFFSET
-                off = off + endian_off - (idx * size)
-                assert idx * size + 8 <= 16
-                self.mc.load(r.SCRATCH.value, r.SP.value, off)
-                self.mc.store(r.SCRATCH.value, r.SP.value, PARAM_SAVE_AREA_OFFSET+16+endian_off)
-                self.mc.lvx(res, r.SCRATCH2.value, r.SP.value)
+                #off = PARAM_SAVE_AREA_OFFSET
+                #off = off + endian_off - (idx * size)
+                #assert idx * size + 8 <= 16
+                #self.mc.load(r.SCRATCH.value, r.SP.value, off)
+                #self.mc.store(r.SCRATCH.value, r.SP.value, PARAM_SAVE_AREA_OFFSET+16+endian_off)
+                #self.mc.lvx(res, r.SCRATCH2.value, r.SP.value)
                 return
 
         not_implemented("%d bit integer, count %d" % \
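
The reworked float path of _accum_reduce above first swaps the two doublewords
with VPDI and then adds/multiplies with VFA/VFM, where the 0b1000 value in the
mask field is assumed to select single-element (element 0) operation and
permi(1, 0) is assumed to produce the doubleword swap (neither helper is shown
in this diff). Under those assumptions the reduction behaves like this model:

# hedged model of the new float reduction: element 0 of the target ends up
# holding acc[0] + acc[1] (or acc[0] * acc[1])
def accum_reduce_float(acc, op):
    target = [acc[1], acc[0]]              # VPDI ..., permi(1, 0): swap doublewords
    if op == '+':
        target[0] = target[0] + acc[0]     # VFA with the single-element bit set
    elif op == '*':
        target[0] = target[0] * acc[0]     # VFM with the single-element bit set
    return target[0]

assert accum_reduce_float([1.5, 2.5], '+') == 4.0
assert accum_reduce_float([2.0, 3.0], '*') == 6.0
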
diff --git a/rpython/jit/metainterp/test/test_vector.py b/rpython/jit/metainterp/test/test_vector.py
--- a/rpython/jit/metainterp/test/test_vector.py
+++ b/rpython/jit/metainterp/test/test_vector.py
@@ -402,10 +402,10 @@
 
         bits = 64
         la = data.draw(st.lists(strat, min_size=10, max_size=150))
-        #la = [1.0] * 10
+        la = [1.0] * 10
         l = len(la)
 
-        accum = 0 #data.draw(strat)
+        accum = data.draw(strat)
         rawstorage = RawStorage()
         va = rawstorage.new(la, type)
         res = self.meta_interp(f, [accum, l*size, va])

