[pypy-commit] pypy vecopt2: weaker guards are stripped from the trace

plan_rich noreply at buildbot.pypy.org
Tue May 5 09:46:23 CEST 2015


Author: Richard Plangger <rich at pasra.at>
Branch: vecopt2
Changeset: r77128:d16c3d437d4e
Date: 2015-05-01 09:43 +0200
http://bitbucket.org/pypy/pypy/changeset/d16c3d437d4e/

Log:	Weaker guards are stripped from the trace. Quick and dirty
	implementation to remove redundant index calculations
	(j=i+1; k=j+1 => j=i+1; k=i+2). Consider moving this into the
	rewrite optimizer (as fijal suggested).
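
	As an aside (not part of the changeset): the index rewrite mentioned in
	the log can be illustrated with a small, self-contained Python sketch.
	It is a deliberate simplification of the IndexVar bookkeeping in
	dependency.py; the names IndexExpr and normalize are invented for this
	illustration and do not exist in the repository.

    # Hypothetical sketch: express every derived index relative to the loop
    # variable, so j = i + 1 followed by k = j + 1 becomes k = i + 2.
    class IndexExpr(object):
        def __init__(self, var, constant=0):
            self.var = var
            self.constant = constant

        def add(self, value):
            # composing additions keeps the original base variable
            return IndexExpr(self.var, self.constant + value)

    def normalize(trace):
        # trace entries are (result, (base, constant)) pairs for int_add-like ops
        exprs = {}
        rewritten = []
        for result, (base, const) in trace:
            expr = exprs.get(base, IndexExpr(base)).add(const)
            exprs[result] = expr
            rewritten.append((result, (expr.var, expr.constant)))
        return rewritten

    # j = i + 1; k = j + 1   is rewritten to   j = i + 1; k = i + 2
    print(normalize([('j', ('i', 1)), ('k', ('j', 1))]))
    # -> [('j', ('i', 1)), ('k', ('i', 2))]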

diff --git a/rpython/jit/metainterp/optimizeopt/dependency.py b/rpython/jit/metainterp/optimizeopt/dependency.py
--- a/rpython/jit/metainterp/optimizeopt/dependency.py
+++ b/rpython/jit/metainterp/optimizeopt/dependency.py
@@ -54,7 +54,9 @@
         if exclude_last:
             count -= 1
         while i < count: 
-            if not self.path[i].op.has_no_side_effect():
+            op = self.path[i].getoperation()
+            if not op.has_no_side_effect() \
+               and op.getopnum() != rop.GUARD_EARLY_EXIT:
                 return False
             i += 1
         return True
@@ -62,6 +64,9 @@
     def walk(self, node):
         self.path.append(node)
 
+    def cut_off_at(self, index):
+        self.path = self.path[:index]
+
     def clone(self):
         return Path(self.path[:])
 
@@ -89,26 +94,26 @@
     def getfailarg_set(self):
         op = self.getoperation()
         assert isinstance(op, GuardResOp)
-        args = []
+        args = {}
         if op.getfailargs():
             for arg in op.getfailargs():
-                args.append(arg)
-            return args
+                args[arg] = None
+            return args.keys()
         elif op.rd_snapshot:
             ss = op.rd_snapshot
             assert isinstance(ss, Snapshot)
             while ss:
                 for box in ss.boxes:
-                    args.append(box)
+                    args[box] = None
                 ss = ss.prev
 
-        return args
+        return args.keys()
 
 
     def relax_guard_to(self, guard):
         """ Relaxes a guard operation to an earlier guard. """
         tgt_op = self.getoperation()
-        op = guard
+        op = guard.getoperation()
         assert isinstance(tgt_op, GuardResOp)
         assert isinstance(op, GuardResOp)
         #descr = compile.ResumeAtLoopHeaderDescr()
@@ -237,24 +242,34 @@
                 worklist.append(dep.to)
         return True
 
-    def iterate_paths(self, to, backwards=False):
+    def iterate_paths(self, to, backwards=False, path_max_len=-1):
         """ yield all nodes from self leading to 'to' """
         if self == to:
             return
-        worklist = [(Path([self]),self)]
+        path = Path([self])
+        worklist = [(0, self, 1)]
         while len(worklist) > 0:
-            path,node = worklist.pop()
+            index,node,pathlen = worklist.pop()
             if backwards:
                 iterdir = node.depends()
             else:
                 iterdir = node.provides()
-            for dep in iterdir:
-                cloned_path = path.clone()
-                cloned_path.walk(dep.to)
-                if dep.to == to:
-                    yield cloned_path
+            if index >= len(iterdir):
+                continue
+            else:
+                next_dep = iterdir[index]
+                next_node = next_dep.to
+                index += 1
+                if index < len(iterdir):
+                    worklist.append((index, node, pathlen))
+                path.cut_off_at(pathlen)
+                path.walk(next_node)
+                pathlen += 1
+
+                if next_node is to or (path_max_len > 0 and pathlen >= path_max_len):
+                    yield path
                 else:
-                    worklist.append((cloned_path,dep.to))
+                    worklist.append((0, next_node, pathlen))
 
     def remove_edge_to(self, node):
         i = 0
@@ -661,7 +676,10 @@
             to = dep.to
             node.remove_edge_to(to)
             if not to.emitted and to.depends_count() == 0:
-                self.schedulable_nodes.append(to)
+                if to.pack:
+                    self.schedulable_nodes.append(to)
+                else:
+                    self.schedulable_nodes.insert(0, to)
         node.clear_dependencies()
         node.emitted = True
 
@@ -682,6 +700,18 @@
             var = self.index_vars[arg] = IndexVar(arg)
         return var
 
+    def operation_INT_LT(self, op, node):
+        box_a0 = op.getarg(0)
+        box_a1 = op.getarg(1)
+        left = None
+        right = None
+        if not self.is_const_integral(box_a0):
+            left = self.get_or_create(box_a0)
+        if not self.is_const_integral(box_a1):
+            right = self.get_or_create(box_a1)
+        box_r = op.result
+        self.index_vars[box_r] = IndexGuard(op.getopnum(), left, right)
+
     additive_func_source = """
     def operation_{name}(self, op, node):
         box_r = op.result
@@ -762,6 +792,25 @@
 IntegralForwardModification.inspect_operation = integral_dispatch_opt
 del integral_dispatch_opt
 
+class IndexGuard(object):
+    def __init__(self, opnum, lindex_var, rindex_var):
+        self.opnum = opnum
+        self.lindex_var = lindex_var
+        self.rindex_var = rindex_var
+
+    def getindex_vars(self):
+        if self.lindex_var and self.rindex_var:
+            return (self.lindex_var, self.rindex_var)
+        elif self.lindex_var:
+            return (self.lindex_var,)
+        elif self.rindex_var:
+            return (self.rindex_var,)
+        else:
+            assert False, "integer comparison must have left or right index"
+
+    def adapt_operation(self, op):
+        pass
+
 class IndexVar(object):
     def __init__(self, var):
         self.var = var
@@ -769,6 +818,9 @@
         self.coefficient_div = 1
         self.constant = 0
 
+    def getvariable(self):
+        return self.var
+
     def __eq__(self, other):
         if self.same_variable(other):
             return self.diff(other) == 0
@@ -777,6 +829,10 @@
     def __ne__(self, other):
         return not self.__eq__(other)
 
+    def less(self, other):
+        if self.same_variable(other):
+            return self.diff(other) < 0
+
     def clone(self):
         c = IndexVar(self.var)
         c.coefficient_mul = self.coefficient_mul
@@ -799,6 +855,18 @@
         return 'IndexVar(%s*(%s/%s)+%s)' % (self.var, self.coefficient_mul,
                                             self.coefficient_div, self.constant)
 
+    def adapt_operation(self, op):
+        # TODO
+        if self.coefficient_mul == 1 and \
+           self.coefficient_div == 1 and \
+           op.getopnum() == rop.INT_ADD:
+           if isinstance(op.getarg(0), Box) and isinstance(op.getarg(1), Const):
+               op.setarg(0, self.var)
+               op.setarg(1, ConstInt(self.constant))
+           elif isinstance(op.getarg(1), Box) and isinstance(op.getarg(0), Const):
+               op.setarg(1, self.var)
+               op.setarg(0, ConstInt(self.constant))
+
 class MemoryRef(object):
     """ a memory reference to an array object. IntegralForwardModification is able
     to propagate changes to this object if applied in backwards direction.
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_dependency.py b/rpython/jit/metainterp/optimizeopt/test/test_dependency.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_dependency.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_dependency.py
@@ -45,7 +45,7 @@
                 node_b = graph.getnode(idx_b)
                 dependency = node_a.getedge_to(node_b)
                 if dependency is None and idx_b not in exceptions.setdefault(idx,[]):
-                    #self._write_dot_and_convert_to_svg(graph, graph.nodes, 'except')
+                    self._write_dot_and_convert_to_svg(graph, 'except')
                     assert dependency is not None, \
                        " it is expected that instruction at index" + \
                        " %s depends on instr on index %s but it does not.\n%s" \
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
@@ -26,7 +26,7 @@
 
 class VecTestHelper(DependencyBaseTest):
 
-    enable_opts = "intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unfold"
+    enable_opts = "intbounds:rewrite:virtualize:string:earlyforce:pure:heap"
 
     jitdriver_sd = FakeJitDriverStaticData()
 
@@ -57,6 +57,10 @@
             raise NotAVectorizeableLoop()
         if unroll_factor == -1:
             unroll_factor = opt.get_unroll_count(ARCH_VEC_REG_SIZE)
+        opt.analyse_index_calculations()
+        if opt.dependency_graph is not None:
+            self._write_dot_and_convert_to_svg(opt.dependency_graph, "ee" + self.test_name)
+            opt.schedule()
         opt.unroll_loop_iterations(loop, unroll_factor)
         opt.loop.operations = opt.get_newoperations()
         opt.clear_newoperations()
@@ -91,6 +95,16 @@
         opt.schedule()
         return opt
 
+    def vectorize(self, loop, unroll_factor = -1):
+        opt = self.vectoroptimizer_unrolled(loop, unroll_factor)
+        opt.find_adjacent_memory_refs()
+        opt.extend_packset()
+        opt.combine_packset()
+        opt.schedule()
+        opt.collapse_index_guards()
+        self._do_optimize_loop(loop, {}, export_state=False)
+        return opt
+
     def assert_unroll_loop_equals(self, loop, expected_loop, \
                      unroll_factor = -1):
         vectoroptimizer = self.vectoroptimizer_unrolled(loop, unroll_factor)
@@ -696,12 +710,12 @@
         loop = self.parse_loop(ops)
         vopt = self.extend_packset(loop,1)
         assert len(vopt.dependency_graph.memory_refs) == 4
+        self.assert_independent(4,10)
         self.assert_independent(5,11)
         self.assert_independent(6,12)
-        self.assert_independent(7,13)
         assert len(vopt.packset.packs) == 3
         self.assert_packset_empty(vopt.packset, len(loop.operations),
-                                  [(6,12), (5,11), (7,13)])
+                                  [(5,11), (4,10), (6,12)])
 
     @pytest.mark.parametrize("descr", ['char','float','int','singlefloat'])
     def test_packset_combine_simple(self,descr):
@@ -853,8 +867,8 @@
         i11 = int_le(i1, 128)
         guard_true(i11) []
         i12 = int_add(i1, {stride})
+        v2 = vec_getarrayitem_raw(p1, i0, 2, descr={descr}arraydescr)
         v1 = vec_getarrayitem_raw(p0, i0, 2, descr={descr}arraydescr)
-        v2 = vec_getarrayitem_raw(p1, i0, 2, descr={descr}arraydescr)
         v3 = {op}(v1,v2,2)
         vec_setarrayitem_raw(p2, i0, v3, 2, descr={descr}arraydescr)
         jump(p0,p1,p2,i12)
@@ -919,6 +933,7 @@
         """
         opt="""
         [i0, i1, i2, i3, i4]
+        i6 = int_mul(i0, 8) 
         i11 = int_add(i0, 1) 
         i12 = int_lt(i11, i1) 
         guard_true(i12) []
@@ -926,9 +941,8 @@
         i13 = int_add(i11, 1) 
         i18 = int_lt(i13, i1) 
         guard_true(i18) []
-        i6 = int_mul(i0, 8) 
+        v20 = vec_raw_load(i3, i6, 2, descr=intarraydescr) 
         v19 = vec_raw_load(i2, i6, 2, descr=intarraydescr) 
-        v20 = vec_raw_load(i3, i6, 2, descr=intarraydescr) 
         v21 = vec_int_add(v19, v20, 2) 
         vec_raw_store(i4, i6, v21, 2, descr=intarraydescr) 
         jump(i13, i1, i2, i3, i4)
@@ -976,22 +990,27 @@
     def test_collapse_index_guard_1(self):
         ops = """
         [p0,i0]
-        guard_early_exit() []
+        guard_early_exit() [p0,i0]
         i1 = getarrayitem_raw(p0, i0, descr=intarraydescr)
         i2 = int_add(i0, 1)
         i3 = int_lt(i2, 102)
         guard_true(i3) [p0,i0]
         jump(p0,i2)
         """
+        dead_code =  '\n        '.join([
+          "i{t} = int_add(i0,{i})\n        i{s} = int_lt(i{t}, 102)".format(
+              i=i+1, t=i+4, s=i+20)
+          for i in range(0,15)])
         opt="""
         [p0,i0]
+        {dead_code}
         i2 = int_add(i0, 16)
         i3 = int_lt(i2, 102)
         guard_true(i3) [p0,i0]
         i1 = vec_getarrayitem_raw(p0, i0, 16, descr=intarraydescr)
         jump(p0,i2)
-        """
-        vopt = self.schedule(self.parse_loop(ops),15)
+        """.format(dead_code=dead_code)
+        vopt = self.vectorize(self.parse_loop(ops),15)
         self.assert_equal(vopt.loop, self.parse_loop(opt))
 
 
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -53,19 +53,22 @@
         def_opt = Optimizer(metainterp_sd, jitdriver_sd, loop, optimizations)
         def_opt.propagate_all_forward()
 
+#class CollapseGuardOptimization(Optimization):
+#    def __init__(self, index_vars = None):
+#        self.index_vars = index_vars or {}
+#
+#    def propagate_forward(
+
 class VectorizingOptimizer(Optimizer):
     """ Try to unroll the loop and find instructions to group """
 
     def __init__(self, metainterp_sd, jitdriver_sd, loop, optimizations):
         Optimizer.__init__(self, metainterp_sd, jitdriver_sd, loop, optimizations)
-        self.memory_refs = []
         self.dependency_graph = None
-        self.first_debug_merge_point = False
         self.packset = None
         self.unroll_count = 0
         self.smallest_type_bytes = 0
-        self.early_exit = None
-        self.future_condition = None
+        self.early_exit_idx = -1
 
     def propagate_all_forward(self, clear=True):
         self.clear_newoperations()
@@ -74,7 +77,6 @@
         if jump.getopnum() not in (rop.LABEL, rop.JUMP):
             # compile_loop appends a additional label to all loops
             # we cannot optimize normal traces
-            assert False
             raise NotAVectorizeableLoop()
 
         self.linear_find_smallest_type(self.loop)
@@ -85,6 +87,12 @@
             # we cannot optimize normal traces (if there is no label)
             raise NotAVectorizeableLoop()
 
+        # find index guards and move to the earliest position
+        self.analyse_index_calculations()
+        if self.dependency_graph is not None:
+            self.schedule() # reorder the trace
+
+
         # unroll
         self.unroll_count = self.get_unroll_count(vsize)
         self.unroll_loop_iterations(self.loop, self.unroll_count)
@@ -96,12 +104,13 @@
         self.find_adjacent_memory_refs()
         self.extend_packset()
         self.combine_packset()
-        self.collapse_index_guards()
         self.schedule()
 
+        self.collapse_index_guards()
+
     def emit_operation(self, op):
-        if op.getopnum() == rop.GUARD_EARLY_EXIT or \
-           op.getopnum() == rop.DEBUG_MERGE_POINT:
+        #if op.getopnum() == rop.GUARD_EARLY_EXIT or \
+        if op.getopnum() == rop.DEBUG_MERGE_POINT:
             return
         self._last_emitted_op = op
         self._newoperations.append(op)
@@ -114,6 +123,7 @@
         """ Unroll the loop X times. unroll_count is an integral how
         often to further unroll the loop.
         """
+
         op_count = len(loop.operations)
 
         label_op = loop.operations[0].clone()
@@ -125,7 +135,7 @@
             jump_op = ResOperation(rop.JUMP, jump_op.getarglist(), None, label_op.getdescr())
         else:
             jump_op = jump_op.clone()
-            jump_op.setdescr(label_op.getdescr())
+            #jump_op.setdescr(label_op.getdescr())
         assert jump_op.is_final()
 
         self.emit_unrolled_operation(label_op)
@@ -134,12 +144,11 @@
         #self.emit_unrolled_operation(guard_ee_op)
 
         operations = []
+        start_index = 1
         for i in range(1,op_count-1):
             op = loop.operations[i].clone()
-            if loop.operations[i].getopnum() == rop.GUARD_FUTURE_CONDITION:
-                pass
             if loop.operations[i].getopnum() == rop.GUARD_EARLY_EXIT:
-                self.future_condition = op
+                continue
             operations.append(op)
             self.emit_unrolled_operation(op)
 
@@ -157,11 +166,13 @@
                 if la != ja:
                     rename_map[la] = ja
             #
+            emitted_ee = False
             for op in operations:
                 if op.getopnum() == rop.GUARD_FUTURE_CONDITION:
                     continue # do not unroll this operation twice
                 if op.getopnum() == rop.GUARD_EARLY_EXIT:
-                    continue # do not unroll this operation twice
+                    emitted_ee = True
+                    pass # do not unroll this operation twice
                 copied_op = op.clone()
                 if copied_op.result is not None:
                     # every result assigns a new box, thus creates an entry
@@ -180,7 +191,7 @@
                 # not only the arguments, but also the fail args need
                 # to be adjusted. rd_snapshot stores the live variables
                 # that are needed to resume.
-                if copied_op.is_guard():
+                if copied_op.is_guard() and emitted_ee:
                     assert isinstance(copied_op, GuardResOp)
                     snapshot = self.clone_snapshot(copied_op.rd_snapshot, rename_map)
                     copied_op.rd_snapshot = snapshot
@@ -231,6 +242,8 @@
     def linear_find_smallest_type(self, loop):
         # O(#operations)
         for i,op in enumerate(loop.operations):
+            if op.getopnum() == rop.GUARD_EARLY_EXIT:
+                self.early_exit_idx = i
             if op.is_array_op():
                 descr = op.getdescr()
                 if not descr.is_array_of_pointers():
@@ -250,7 +263,6 @@
 
     def build_dependency_graph(self):
         self.dependency_graph = DependencyGraph(self.loop.operations)
-        self.relax_index_guards()
 
     def find_adjacent_memory_refs(self):
         """ the pre pass already builds a hash of memory references and the
@@ -346,12 +358,11 @@
                 break
 
     def schedule(self):
+        self.guard_early_exit = -1
         self.clear_newoperations()
         scheduler = Scheduler(self.dependency_graph, VecScheduleData())
-        #dprint("scheduling loop. scheduleable are: " + str(scheduler.schedulable_nodes))
         while scheduler.has_more():
             candidate = scheduler.next()
-            #dprint("  candidate", candidate, "has pack?", candidate.pack != None, "pack", candidate.pack)
             if candidate.pack:
                 pack = candidate.pack
                 if scheduler.schedulable(pack.operations):
@@ -362,8 +373,6 @@
                 else:
                     scheduler.schedule_later(0)
             else:
-                if candidate.getopnum() == rop.GUARD_EARLY_EXIT:
-                    pass
                 position = len(self._newoperations)
                 self.emit_operation(candidate.getoperation())
                 scheduler.schedule(0, position)
@@ -372,69 +381,90 @@
             for node in self.dependency_graph.nodes:
                 assert node.emitted
         self.loop.operations = self._newoperations[:]
+        self.clear_newoperations()
 
-    def relax_index_guards(self):
-        label_idx = 0
-        early_exit_idx = 1
-        label = self.dependency_graph.getnode(label_idx)
-        ee_guard = self.dependency_graph.getnode(early_exit_idx)
-        if not ee_guard.is_guard_early_exit():
-            return # cannot relax
+    def analyse_index_calculations(self):
+        if len(self.loop.operations) <= 1 or self.early_exit_idx == -1:
+            return
 
-        #self.early_exit = ee_guard
+        self.dependency_graph = dependencies = DependencyGraph(self.loop.operations)
 
-        for guard_node in self.dependency_graph.guards:
-            if guard_node == ee_guard:
-                continue
-            if guard_node.getopnum() not in (rop.GUARD_TRUE,rop.GUARD_FALSE):
+        label_node = dependencies.getnode(0)
+        ee_guard_node = dependencies.getnode(self.early_exit_idx)
+        guards = dependencies.guards
+        fail_args = []
+        for guard_node in guards:
+            if guard_node is ee_guard_node:
                 continue
             del_deps = []
             pullup = []
-            iterb = guard_node.iterate_paths(ee_guard, True)
             last_prev_node = None
-            for path in iterb:
+            for path in guard_node.iterate_paths(ee_guard_node, True):
                 prev_node = path.second()
-                if fail_args_break_dependency(guard_node, prev_node, ee_guard):
+                if fail_args_break_dependency(guard_node, prev_node, ee_guard_node):
                     if prev_node == last_prev_node:
                         continue
-                    dprint("relax) ", prev_node, "=>", guard_node)
-                    del_deps.append((prev_node,guard_node))
+                    del_deps.append((prev_node, guard_node))
                 else:
-                    pullup.append(path)
+                    if path.has_no_side_effects(exclude_first=True, exclude_last=True):
+                        #index_guards[guard.getindex()] = IndexGuard(guard, path.path[:])
+                        pullup.append(path.last_but_one())
                 last_prev_node = prev_node
             for a,b in del_deps:
                 a.remove_edge_to(b)
-            for candidate in pullup:
-                lbo = candidate.last_but_one()
-                if candidate.has_no_side_effects(exclude_first=True, exclude_last=True):
-                    ee_guard.remove_edge_to(lbo)
-                    label.edge_to(lbo, label='pullup')
-            guard_node.edge_to(ee_guard, label='pullup')
-            label.remove_edge_to(ee_guard)
-
-            guard_node.relax_guard_to(self.future_condition)
+            for lbo in pullup:
+                if lbo is ee_guard_node:
+                    continue
+                ee_guard_node.remove_edge_to(lbo)
+                label_node.edge_to(lbo, label='pullup')
+            # only the last guard needs a connection
+            guard_node.edge_to(ee_guard_node, label='pullup-last-guard')
+            guard_node.relax_guard_to(ee_guard_node)
 
     def collapse_index_guards(self):
-        pass
-        #final_ops = []
-        #last_guard = None
-        #is_after_relax = False
-        #for op in self._newoperations:
-        #    if op.getopnum() == rop.GUARD_EARLY_EXIT:
-        #        assert last_guard is not None
-        #        final_ops.append(last_guard)
-        #        is_after_relax = True
-        #        continue
-        #    if not is_after_relax:
-        #        if op.is_guard():
-        #            last_guard = op
-        #        else:
-        #            final_ops.append(op)
-        #    else:
-        #        final_ops.append(op)
-        #assert is_after_relax
-        #return final_ops
+        strongest_guards = {}
+        strongest_guards_var = {}
+        index_vars = self.dependency_graph.index_vars
+        operations = self.loop.operations
+        var_for_guard = {}
+        for i in range(len(operations)-1, -1, -1):
+            op = operations[i]
+            if op.is_guard():
+                for arg in op.getarglist():
+                    var_for_guard[arg] = True
+                    try:
+                        comparison = index_vars[arg]
+                        for index_var in comparison.getindex_vars():
+                            var = index_var.getvariable()
+                            strongest_known = strongest_guards_var.get(var, None)
+                            if not strongest_known:
+                                strongest_guards_var[var] = index_var
+                                continue
+                            if index_var.less(strongest_known):
+                                strongest_guards_var[var] = strongest_known
+                                strongest_guards[op] = strongest_known
+                    except KeyError:
+                        pass
 
+        last_op_idx = len(operations)-1
+        for op in operations:
+            if op.is_guard():
+                stronger_guard = strongest_guards.get(op, None)
+                if stronger_guard:
+                    # there is a stronger guard
+                    continue
+                else:
+                    self.emit_operation(op)
+                    continue
+            if op.is_always_pure() and op.result:
+                try:
+                    var_index = index_vars[op.result]
+                    var_index.adapt_operation(op)
+                except KeyError:
+                    pass
+            self.emit_operation(op)
+
+        self.loop.operations = self._newoperations[:]
 
 def must_unpack_result_to_exec(op, target_op):
     # TODO either move to resop or util
@@ -445,7 +475,6 @@
 def prohibit_packing(op1, op2):
     if op1.is_array_op():
         if op1.getarg(1) == op2.result:
-            dprint("prohibit)", op1, op2)
             return True
     return False
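
For completeness, a minimal, hypothetical sketch of the "strip weaker guards"
idea that collapse_index_guards implements above: when several guards check
index < bound on the same base variable and bound, only the guard with the
largest constant offset has to stay in the trace, since it implies the others.
The names Guard and keep_strongest_guards are invented for this illustration
and are not code from the repository.

    from collections import namedtuple

    # a guard of the form  var + offset < bound
    Guard = namedtuple('Guard', ['var', 'offset', 'bound'])

    def keep_strongest_guards(guards):
        strongest = {}
        for g in guards:
            key = (g.var, g.bound)
            best = strongest.get(key)
            # a larger offset implies the smaller-offset guards, so it is stronger
            if best is None or g.offset > best.offset:
                strongest[key] = g
        return [g for g in guards if strongest[(g.var, g.bound)] is g]

    guards = [Guard('i', 1, 102), Guard('i', 2, 102), Guard('i', 16, 102)]
    print(keep_strongest_guards(guards))
    # -> [Guard(var='i', offset=16, bound=102)]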
 

