[pypy-commit] pypy vecopt-merge: packset combination now maximizes the packset and splits it right after combination into chunks that fit into the vector register; this eliminates the sorting (let's see how this works out)

plan_rich noreply at buildbot.pypy.org
Thu Aug 27 19:17:31 CEST 2015


Author: Richard Plangger <rich at pasra.at>
Branch: vecopt-merge
Changeset: r79262:1b4b6db46742
Date: 2015-08-27 19:17 +0200
http://bitbucket.org/pypy/pypy/changeset/1b4b6db46742/

Log:	packset combination now maximizes the packset and splits it right
	after combination into chunks that fit into the vector register;
	this eliminates the sorting (let's see how this works out)
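
	In other words: packs are first grown as far as the combination
	step allows, and only afterwards are oversized packs cut into
	chunks that fit the vector register. A minimal, standalone sketch
	of that splitting step (plain Python with a simplified,
	hypothetical SimplePack class, not the actual RPython Pack from
	schedule.py):

    VEC_REG_SIZE = 16  # assumed: a 128-bit (16 byte) vector register

    class SimplePack(object):
        def __init__(self, operations, element_size):
            self.operations = operations      # packed scalar operations
            self.element_size = element_size  # bytes per packed element

        def byte_size(self):
            return self.element_size * len(self.operations)

        def split(self, vec_reg_size=VEC_REG_SIZE):
            """Cut an oversized pack into register-sized chunks."""
            max_count = vec_reg_size // self.element_size
            assert max_count * self.element_size == vec_reg_size
            chunks = []
            ops = self.operations
            while len(ops) * self.element_size > vec_reg_size:
                chunks.append(SimplePack(ops[:max_count], self.element_size))
                ops = ops[max_count:]
            chunks.append(SimplePack(ops, self.element_size))
            return chunks

    # six 8-byte operations do not fit into one 16-byte register:
    pack = SimplePack(['f%d' % i for i in range(6)], element_size=8)
    assert [len(p.operations) for p in pack.split()] == [2, 2, 2]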

diff --git a/rpython/jit/metainterp/optimizeopt/dependency.py b/rpython/jit/metainterp/optimizeopt/dependency.py
--- a/rpython/jit/metainterp/optimizeopt/dependency.py
+++ b/rpython/jit/metainterp/optimizeopt/dependency.py
@@ -1073,9 +1073,9 @@
 
     def __repr__(self):
         if self.is_identity():
-            return 'IndexVar(%s+%s)' % (self.var, repr(self.next_nonconst))
+            return 'idx(%s)' % (self.var,)
 
-        return 'IndexVar((%s*(%s/%s)+%s))' % (self.var, self.coefficient_mul,
+        return 'idx(%s*(%s/%s)+%s)' % (self.var, self.coefficient_mul,
                                             self.coefficient_div, self.constant)
 
 class MemoryRef(object):
diff --git a/rpython/jit/metainterp/optimizeopt/schedule.py b/rpython/jit/metainterp/optimizeopt/schedule.py
--- a/rpython/jit/metainterp/optimizeopt/schedule.py
+++ b/rpython/jit/metainterp/optimizeopt/schedule.py
@@ -892,15 +892,21 @@
     def leftmost(self):
         return self.operations[0].getoperation()
 
+    def pack_type(self):
+        ptype = self.input_type
+        if self.input_type is None:
+            # load does not have an input type, but only an output type
+            ptype = self.output_type
+        return ptype
+
+    def pack_byte_size(self):
+        return self.pack_type().getsize() * self.opcount()
+
     def is_full(self, vec_reg_size):
         """ If one input element times the opcount is equal
             to the vector register size, we are full!
         """
-        ptype = self.input_type
-        if self.input_type is None:
-            # load does not have an input type, but only an output type
-            ptype = self.output_type
-
+        ptype = self.pack_type()
         op = self.leftmost()
         if op.casts_box():
             cur_bytes = ptype.getsize() * self.opcount()
@@ -908,7 +914,7 @@
             assert cur_bytes <= max_bytes
             return cur_bytes == max_bytes
 
-        bytes = ptype.getsize() * len(self.operations)
+        bytes = self.pack_byte_size()
         assert bytes <= vec_reg_size
         if bytes == vec_reg_size:
             return True
@@ -933,6 +939,20 @@
             node.pack = self
             node.pack_position = i
 
+    def split(self, packlist, vec_reg_size):
+        pack = self
+        pack_type = self.pack_type()
+        max_count = vec_reg_size // pack_type.getsize()
+        assert max_count * pack_type.getsize() == vec_reg_size
+        while pack.pack_byte_size() > vec_reg_size:
+            newpack = pack.clone()
+            oplist = pack.operations[:max_count]
+            newpack.operations = pack.operations[max_count:]
+            pack.operations = oplist
+            pack.update_pack_of_nodes()
+            newpack.update_pack_of_nodes()
+            pack = newpack
+
     def rightmost_match_leftmost(self, other):
         """ Check if pack A can be combined with pack B """
         assert isinstance(other, Pack)
@@ -954,6 +974,12 @@
     def is_accumulating(self):
         return self.accum is not None
 
+    def clone(self):
+        cloned = Pack(self.operations, self.input_type, self.output_type)
+        cloned.accum = self.accum
+        return cloned
+
+
 class Pair(Pack):
     """ A special Pack object with only two statements. """
     def __init__(self, left, right, input_type, output_type):
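
As a worked example of the new Pack helpers above (sizes assumed, not
taken from the changeset), continuing the SimplePack sketch from the
top of this mail: with a 16-byte register and 8-byte elements a pack
of two operations is exactly full, while a combined pack of four
operations is oversized and gets split into two full packs.

    p2 = SimplePack(['a', 'b'], element_size=8)
    assert p2.byte_size() == 16     # exactly one register: "full"

    p4 = SimplePack(['a', 'b', 'c', 'd'], element_size=8)
    assert p4.byte_size() == 32     # oversized after combination
    assert [len(c.operations) for c in p4.split(16)] == [2, 2]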
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -319,16 +319,16 @@
             for node_b,memref_b in memory_refs:
                 if memref_a is memref_b:
                     continue
+                #print "???", memref_a.index_var, memref_b.index_var
                 # instead of comparing every possible combination and
                 # excluding a_opidx == b_opidx, only consider the ones
                 # that point forward:
-                if node_a.is_before(node_b):
-                    if memref_a.is_adjacent_to(memref_b):
-                        pair = self.packset.can_be_packed(node_a, node_b, None, False)
-                        if pair:
-                            if node_a.op.getopnum() == rop.GETARRAYITEM_RAW:
-                                print "found", memref_a.index_var, memref_b.index_var
-                            self.packset.add_pack(pair)
+                if memref_a.is_adjacent_after(memref_b):
+                    pair = self.packset.can_be_packed(node_a, node_b, None, False)
+                    if pair:
+                        if node_a.op.getopnum() == rop.GETARRAYITEM_RAW:
+                            print " => found", memref_a.index_var, memref_b.index_var
+                        self.packset.add_pack(pair)
 
     def extend_packset(self):
         """ Follow dependency chains to find more candidates to put into
@@ -389,13 +389,13 @@
         """
         if len(self.packset.packs) == 0:
             raise NotAVectorizeableLoop()
-        packsort(self.packset.packs).sort()
-        if not we_are_translated():
-            # ensure we are really sorted!
-            x = 0
-            for i,pack in enumerate(self.packset.packs):
-                assert x <= pack.left.getindex()
-                x = pack.left.getindex()
+        #packsort(self.packset.packs).sort()
+        #if not we_are_translated():
+        #    # ensure we are really sorted!
+        #    x = 0
+        #    for i,pack in enumerate(self.packset.packs):
+        #        assert x <= pack.left.getindex()
+        #        x = pack.left.getindex()
         i = 0
         j = 0
         end_ij = len(self.packset.packs)
@@ -413,31 +413,31 @@
                     pack2 = self.packset.packs[j]
                     # remove intermediate
                     left = pack1.operations[0]
-                    if left in orphan:
-                        # a pack was filled, thus the rhs was put
-                        # into the orphan map.
-                        if orphan[left] is False:
-                            # this pack might be redundant if pack1.right
-                            # is the at the left position in another pack
-                            assert pack1.opcount() == 2
-                            right = pack1.operations[1]
-                            orphan[right] = True
-                            pack1.clear()
-                            del self.packset.packs[i]
-                            end_ij -= 1
-                            continue
-                        else:
-                            # left is not an orphan, this pack proves that
-                            # there might be more packs
-                            del orphan[left]
+                    #if left in orphan:
+                    #    # a pack was filled, thus the rhs was put
+                    #    # into the orphan map.
+                    #    if orphan[left] is False:
+                    #        # this pack might be redundant if pack1.right
+                    #        # is the at the left position in another pack
+                    #        assert pack1.opcount() == 2
+                    #        right = pack1.operations[1]
+                    #        orphan[right] = True
+                    #        pack1.clear()
+                    #        del self.packset.packs[i]
+                    #        end_ij -= 1
+                    #        continue
+                    #    else:
+                    #        # left is not an orphan, this pack proves that
+                    #        # there might be more packs
+                    #        del orphan[left]
                     # check if the pack is already full
-                    if pack1.is_full(self.cpu.vector_register_size):
-                        right = pack1.operations[-1]
-                        # False indicates that the next pair might not
-                        # be needed, because left is already computed
-                        # in another set
-                        orphan[right] = False
-                        break
+                    #if pack1.is_full(self.cpu.vector_register_size):
+                    #    right = pack1.operations[-1]
+                    #    # False indicates that the next pair might not
+                    #    # be needed, because left is already computed
+                    #    # in another set
+                    #    orphan[right] = False
+                    #    break
                     if pack1.rightmost_match_leftmost(pack2):
                         end_ij = self.packset.combine(i,j)
                     else:
@@ -449,7 +449,10 @@
             if len_before == len(self.packset.packs):
                 break
         for pack in self.packset.packs:
-            pack.update_pack_of_nodes()
+            if pack.pack_byte_size() > self.cpu.vector_register_size:
+                pack.split(self.packset.packs, self.cpu.vector_register_size)
+            else:
+                pack.update_pack_of_nodes()
 
         if not we_are_translated():
             # some test cases check the accumulation variables
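
A minimal sketch of the overall flow this changeset moves to, again
using the hypothetical SimplePack helpers from above rather than the
real PackSet API: keep merging a pack whose rightmost operation equals
the leftmost operation of another pack (no presorting needed), and only
once no more merges are possible cut everything down to register size.

    def combine_then_split(packs, vec_reg_size=VEC_REG_SIZE):
        """Merge chained packs to maximal size, then split oversized ones."""
        changed = True
        while changed:
            changed = False
            for i, a in enumerate(packs):
                for j, b in enumerate(packs):
                    if i == j:
                        continue
                    # rightmost of a matches leftmost of b: merge b into a
                    if a.operations[-1] == b.operations[0]:
                        a.operations = a.operations + b.operations[1:]
                        del packs[j]
                        changed = True
                        break
                if changed:
                    break
        # only now enforce the vector register size
        result = []
        for pack in packs:
            if pack.byte_size() > vec_reg_size:
                result.extend(pack.split(vec_reg_size))
            else:
                result.append(pack)
        return result

    # the pairs (a,b), (b,c), (c,d) combine into one 4-wide pack,
    # which is then split into two register-sized packs:
    pairs = [SimplePack(list(p), 8) for p in (('a', 'b'), ('b', 'c'), ('c', 'd'))]
    assert [p.operations for p in combine_then_split(pairs)] == \
           [['a', 'b'], ['c', 'd']]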

