[pypy-commit] pypy vecopt: rewritten the guard strengthening. it is now independent from vecopt (still contained in the same file though).

Thu May 21 18:26:40 CEST 2015

Author: Richard Plangger <rich at pasra.at>
Branch: vecopt
Changeset: r77454:1f73ac83382c
Date: 2015-05-21 18:26 +0200
http://bitbucket.org/pypy/pypy/changeset/1f73ac83382c/

Log:	rewritten the guard strengthening. it is now independent from vecopt
	(still contained in the same file though). the previous version was
	not correct and could only rewrite int_add. now if a guard implies a
	previous guard, the earlier guard is replaced directly

diff --git a/rpython/jit/metainterp/optimizeopt/dependency.py b/rpython/jit/metainterp/optimizeopt/dependency.py
--- a/rpython/jit/metainterp/optimizeopt/dependency.py
+++ b/rpython/jit/metainterp/optimizeopt/dependency.py
@@ -2,7 +2,7 @@
 
 from rpython.jit.metainterp import compile
 from rpython.jit.metainterp.optimizeopt.util import make_dispatcher_method
-from rpython.jit.metainterp.resoperation import (rop, GuardResOp)
+from rpython.jit.metainterp.resoperation import (rop, GuardResOp, ResOperation)
 from rpython.jit.metainterp.resume import Snapshot
 from rpython.jit.codewriter.effectinfo import EffectInfo
 from rpython.jit.metainterp.history import BoxPtr, ConstPtr, ConstInt, BoxInt, Box, Const, BoxFloat
@@ -83,6 +83,9 @@
         self.emitted = False
         self.schedule_position = -1
         self.priority = 0
+        # save the operation that produces the result for the first argument
+        # only for guard_true/guard_false
+        self.guard_bool_bool_node = None
 
     def getoperation(self):
         return self.op
@@ -531,8 +534,7 @@
                 self.build_non_pure_dependencies(node, tracker)
         # pass 2 correct guard dependencies
         for guard_node in self.guards:
-            op = guard_node.getoperation()
-            self.build_guard_dependencies(guard_node, op.getopnum(), tracker)
+            self.build_guard_dependencies(guard_node, tracker)
         # pass 3 find schedulable nodes
         jump_node = self.nodes[jump_pos]
         label_node = self.nodes[label_pos]
@@ -559,9 +561,15 @@
         if guard_opnum in (rop.GUARD_TRUE, rop.GUARD_FALSE):
             for dep in guard_node.depends():
                 op = dep.to.getoperation()
-                for arg in op.getarglist():
-                    if isinstance(arg, Box):
-                        self.guard_exit_dependence(guard_node, arg, tracker)
+                if op.returns_bool_result() and op.result == guard_op.getarg(0):
+                    guard_node.guard_bool_bool_node = dep.to
+                    for arg in op.getarglist():
+                        if isinstance(arg, Box):
+                            self.guard_exit_dependence(guard_node, arg, tracker)
+                    break
+            else:
+                raise RuntimeError("guard_true/false has no operation that " \
+                                   "returns the bool for the arg 0")
         elif guard_op.is_foldable_guard():
             # these guards carry their protected variables directly as a parameter
             for arg in guard_node.getoperation().getarglist():
@@ -598,12 +606,12 @@
             if guard_node.is_before(dep.to) and dep.because_of(var):
                 guard_node.edge_to(dep.to, var, label='guard_exit('+str(var)+')')
 
-    def build_guard_dependencies(self, guard_node, guard_opnum, tracker):
-        if guard_opnum >= rop.GUARD_NOT_INVALIDATED:
+    def build_guard_dependencies(self, guard_node, tracker):
+        guard_op = guard_node.op
+        if guard_op.getopnum() >= rop.GUARD_NOT_INVALIDATED:
             # ignore invalidated & future condition guard & early exit
             return
         # true dependencies
-        guard_op = guard_node.op
         for arg in guard_op.getarglist():
             tracker.depends_on_arg(arg, guard_node)
         # dependencies to uses of arguments it protects
@@ -970,6 +978,33 @@
         othercoeff = other.coefficient_mul // other.coefficient_div
         return mycoeff + self.constant - (othercoeff + other.constant)
 
+    def emit_operations(self, opt):
+        box = self.var
+        if self.coefficient_mul != 1:
+            box_result = box.clonebox()
+            opt.emit_operation(ResOperation(rop.INT_MUL, [box, ConstInt(self.coefficient_mul)], box_result))
+            box = box_result
+        if self.coefficient_div != 1:
+            box_result = box.clonebox()
+            opt.emit_operation(ResOperation(rop.INT_FLOORDIV, [box, ConstInt(self.coefficient_div)], box_result))
+            box = box_result
+        if self.constant != 0:
+            box_result = box.clonebox()
+            opt.emit_operation(ResOperation(rop.INT_ADD, [box, ConstInt(self.constant)], box_result))
+            box = box_result
+        return box
+
+    def compare(self, other):
+        assert isinstance(other, IndexVar)
+        v1 = (self.coefficient_mul // self.coefficient_div) + self.constant
+        v2 = (other.coefficient_mul // other.coefficient_div) + other.constant
+        if v1 == v2:
+            return 0
+        elif v1 < v2:
+            return -1
+        else:
+            return 1
+
     def __repr__(self):
         if self.is_identity():
             return 'IndexVar(%s+%s)' % (self.var, repr(self.next_nonconst))
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
@@ -12,7 +12,7 @@
 from rpython.jit.metainterp.optimizeopt.dependency import DependencyGraph
 from rpython.jit.metainterp.optimizeopt.unroll import Inliner
 from rpython.jit.metainterp.optimizeopt.vectorize import (VectorizingOptimizer, MemoryRef,
-        isomorphic, Pair, NotAVectorizeableLoop, NotAVectorizeableLoop)
+        isomorphic, Pair, NotAVectorizeableLoop, NotAVectorizeableLoop, GuardStrengthenOpt)
 from rpython.jit.metainterp.optimize import InvalidLoop
 from rpython.jit.metainterp.history import ConstInt, BoxInt, get_const_ptr_for_string
 from rpython.jit.metainterp import executor, compile, resume
@@ -102,7 +102,8 @@
         opt.extend_packset()
         opt.combine_packset()
         opt.schedule()
-        opt.collapse_index_guards()
+        gso = GuardStrengthenOpt(opt.dependency_graph.index_vars)
+        gso.propagate_all_forward(opt.loop)
         return opt
 
     def assert_unroll_loop_equals(self, loop, expected_loop, \
@@ -879,30 +880,6 @@
         vopt = self.schedule(loop,1)
         self.assert_equal(loop, self.parse_loop(vops))
 
-    @pytest.mark.parametrize('unroll', [1])
-    def test_vectorize_index_variable_combination(self, unroll):
-        ops = """
-        [p0,i0]
-        guard_early_exit() []
-        i1 = raw_load(p0, i0, descr=floatarraydescr)
-        i2 = int_add(i0,8)
-        jump(p0,i2)
-        """
-        vops = """
-        [p0,i0]
-        guard_early_exit() []
-        """ + '\n        '.join(["i{x} = int_add(i0,{i})".format(i=8*(i+1),x=i+100) for i in range(unroll) ]) + \
-        """
-        i1 = int_add(i0, {count})
-        v1 = vec_raw_load(p0, i0, {elems}, descr=floatarraydescr)
-        jump(p0,i1)
-        """.format(count=(unroll+1)*8,elems=unroll+1)
-        print vops
-        loop = self.parse_loop(ops)
-        vopt = self.vectorize(loop,unroll)
-        self.assert_equal(loop, self.parse_loop(vops))
-
-
     def test_vschedule_trace_1(self):
         ops = """
         [i0, i1, i2, i3, i4]
@@ -948,18 +925,22 @@
         jump(p0,i2)
         """
         dead_code =  '\n        '.join([
-          "i{t} = int_add(i0,{i})\n        i{s} = int_lt(i{t}, 102)".format(
-              i=i+1, t=i+4, s=i+20)
-          for i in range(0,15)])
+          "i{t1} = int_add(i{t},1)\n        i{s} = int_lt(i{t1}, 102)".format(
+              i=i+1, t1=i+201, t=i+200, s=i+20)
+          for i in range(0,14)])
         opt="""
         [p0,i0]
         guard_early_exit() [p0,i0]
-        {dead_code}
+        i200 = int_add(i0, 1)
+        i400 = int_lt(i200, 102)
         i2 = int_add(i0, 16)
         i3 = int_lt(i2, 102)
         guard_true(i3) [p0,i0]
+        {dead_code}
+        i500 = same_as(i2)
+        i300 = int_lt(i500, 102)
         i1 = vec_getarrayitem_raw(p0, i0, 16, descr=chararraydescr)
-        jump(p0,i2)
+        jump(p0,i500)
         """.format(dead_code=dead_code)
         vopt = self.vectorize(self.parse_loop(ops),15)
         self.assert_equal(vopt.loop, self.parse_loop(opt))
@@ -1001,10 +982,12 @@
         i2 = int_add(i0, 2)
         i3 = int_lt(i2, 10)
         guard_true(i3) [p0,i0]
+        i4 = same_as(i2)
+        i5 = int_lt(i4, 10)
         v1 = vec_getarrayitem_raw(p0, i0, 2, descr=floatarraydescr)
         v3 = vec_int_expand(42)
         v2 = vec_int_mul(v1, v3, 2)
-        jump(p0,i2)
+        jump(p0,i4)
         """
         vopt = self.vectorize(self.parse_loop(ops),1)
         self.assert_equal(vopt.loop, self.parse_loop(opt))
@@ -1028,10 +1011,12 @@
         i2 = int_add(i0, 2)
         i3 = int_lt(i2, 10)
         guard_true(i3) [p0,i0]
+        i4 = same_as(i2)
+        i5 = int_lt(i4, 10)
         v1 = vec_getarrayitem_raw(p0, i0, 2, descr=floatarraydescr)
         v3 = vec_float_expand(f3)
         v2 = vec_int_mul(v1, v3, 2)
-        jump(p0,i2,f3)
+        jump(p0,i4,f3)
         """
         vopt = self.vectorize(self.parse_loop(ops),1)
         self.assert_equal(vopt.loop, self.parse_loop(opt))
@@ -1062,10 +1047,11 @@
         i48 = int_add(i41, 8) 
         i51 = int_add(i37, 8) 
         i52 = int_ge(i50, i18) 
-        i55 = int_add(i44, 16) 
-        i54 = int_add(i41, 16) 
-        i56 = int_add(i37, 16) 
-        i53 = int_add(i28, 2) 
+        guard_false(i52) [p38, p12, p9, p14, p39, i37, i44, f35, i40, p42, i43, f34, i28, p36, i41]
+        i55 = int_add(i46, 8) 
+        i54 = int_add(i48, 8) 
+        i56 = int_add(i51, 8) 
+        i53 = int_add(i50, 1)
         i57 = int_ge(i53, i18) 
         guard_false(i57) [p38, p12, p9, p14, p39, i37, i44, f35, i40, p42, i43, f34, i28, p36, i41]
         v61 = vec_raw_load(i21, i44, 2, descr=floatarraydescr) 
@@ -1106,7 +1092,7 @@
         try:
             vopt = self.vectorize(self.parse_loop(ops))
             self.debug_print_operations(vopt.loop)
-            # TODO verify
+            py.test.fail("this loop should not be vectorized")
         except NotAVectorizeableLoop:
             pass
 
@@ -1117,7 +1103,7 @@
         f1 = getarrayitem_raw(p0, i1, descr=floatarraydescr)
         i2 = cast_float_to_singlefloat(f1)
         setarrayitem_raw(p1, i1, i2, descr=singlefloatarraydescr)
-        i3 = int_add(i1, 1)
+        i3 = int_sub(i1, 1)
         i4 = int_ge(i3, 36)
         guard_false(i4) []
         jump(p0, p1, i3)
@@ -1125,15 +1111,17 @@
         opt = """
         [p0, p1, i1]
         guard_early_exit() []
-        i3 = int_add(i1, 1)
+        i3 = int_sub(i1, 1)
         i4 = int_ge(i3, 36)
-        i5 = int_add(i1, 2)
+        i50 = int_add(i1, -4)
+        i51 = int_ge(i50, 36)
+        guard_false(i51) []
+        i5 = int_sub(i3, 1)
         i8 = int_ge(i5, 36)
-        i6 = int_add(i1, 3)
+        i6 = int_sub(i5, 1)
         i11 = int_ge(i6, 36)
-        i7 = int_add(i1, 4)
+        i7 = same_as(i50)
         i14 = int_ge(i7, 36)
-        guard_false(i14) []
         v17 = vec_getarrayitem_raw(p0, i1, 2, descr=floatarraydescr)
         v18 = vec_getarrayitem_raw(p0, i5, 2, descr=floatarraydescr)
         v19 = vec_cast_float_to_singlefloat(v17, 2)
@@ -1168,16 +1156,18 @@
         i5 = int_add(i4, 4)
         i1 = int_add(i0, 4)
         i186 = int_lt(i5, 100)
-        i189 = int_add(i0, 8)
-        i187 = int_add(i4, 8)
-        i198 = int_add(i0, 12)
+        i500 = int_add(i4, 16)
+        i501 = int_lt(i500, 100)
+        guard_false(i501) []
+        i189 = int_add(i1, 4)
+        i187 = int_add(i5, 4)
+        i198 = int_add(i189, 4)
         i188 = int_lt(i187, 100)
-        i207 = int_add(i0, 16)
-        i196 = int_add(i4, 12)
+        i207 = int_add(i198, 4)
+        i196 = int_add(i187, 4)
         i197 = int_lt(i196, 100)
-        i205 = int_add(i4, 16)
+        i205 = same_as(i500)
         i206 = int_lt(i205, 100)
-        guard_false(i206) []
         v228 = vec_raw_load(p0, i0, 4, descr=singlefloatarraydescr)
         v229 = vec_cast_singlefloat_to_float(v228, 2)
         v230 = vec_int_unpack(v228, 2, 2)
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -9,7 +9,7 @@
 from rpython.jit.metainterp.optimizeopt.optimizer import Optimizer, Optimization
 from rpython.jit.metainterp.optimizeopt.util import make_dispatcher_method
 from rpython.jit.metainterp.optimizeopt.dependency import (DependencyGraph, 
-        MemoryRef, Scheduler, SchedulerData, Node)
+        MemoryRef, Scheduler, SchedulerData, Node, IndexVar)
 from rpython.jit.metainterp.resoperation import (rop, ResOperation, GuardResOp)
 from rpython.rlib.objectmodel import we_are_translated
 from rpython.rlib.debug import debug_print, debug_start, debug_stop
@@ -111,7 +111,8 @@
         self.combine_packset()
         self.schedule()
 
-        self.collapse_index_guards()
+        gso = GuardStrengthenOpt(self.dependency_graph.index_vars)
+        gso.propagate_all_forward(self.loop)
 
     def emit_operation(self, op):
         if op.getopnum() == rop.DEBUG_MERGE_POINT:
@@ -399,6 +400,8 @@
 
     def unpack_from_vector(self, op, sched_data):
         args = op.getarglist()
+        if op.is_guard():
+            py.test.set_trace()
         for i, arg in enumerate(op.getarglist()):
             if isinstance(arg, Box):
                 self._unpack_from_vector(args, i, arg, sched_data)
@@ -427,9 +430,7 @@
     def analyse_index_calculations(self):
         if len(self.loop.operations) <= 1 or self.early_exit_idx == -1:
             return
-
         self.dependency_graph = graph = DependencyGraph(self.loop)
-
         label_node = graph.getnode(0)
         ee_guard_node = graph.getnode(self.early_exit_idx)
         guards = graph.guards
@@ -437,7 +438,6 @@
             if guard_node is ee_guard_node:
                 continue
             modify_later = []
-            valid_trans = True
             last_prev_node = None
             for path in guard_node.iterate_paths(ee_guard_node, True):
                 prev_node = path.second()
@@ -453,7 +453,7 @@
                         #   |
                         #  (g)
                         # this graph yields 2 paths from (g), thus (a) is
-                        # remembered and skipped
+                        # remembered and skipped the second time visited
                         continue
                     modify_later.append((prev_node, guard_node))
                 else:
@@ -461,10 +461,13 @@
                         path.set_schedule_priority(10)
                         modify_later.append((path.last_but_one(), None))
                     else:
-                        valid_trans = False
+                        # transformation is invalid.
+                        # exit and do not enter else branch!
                         break
                 last_prev_node = prev_node
-            if valid_trans:
+            else:
+                # transformation is valid, modify the graph and execute
+                # this guard earlier
                 for a,b in modify_later:
                     if b is not None:
                         a.remove_edge_to(b)
@@ -478,53 +481,187 @@
                 guard_node.edge_to(ee_guard_node, label='pullup-last-guard')
                 guard_node.relax_guard_to(ee_guard_node)
 
-    def collapse_index_guards(self):
+class Guard(object):
+    """ An object wrapper around a guard. Helps to determine
+        if one guard implies another
+    """
+    def __init__(self, op, cmp_op, lhs, rhs):
+        self.op = op
+        self.cmp_op = cmp_op
+        self.lhs = lhs
+        self.rhs = rhs
+        self.emitted = False
+        self.stronger = False
+
+    def implies(self, guard, opt):
+        print self.cmp_op, "=>", guard.cmp_op, "?"
+        my_key = opt._get_key(self.cmp_op)
+        ot_key = opt._get_key(guard.cmp_op)
+
+        if my_key[1] == ot_key[1]:
+            # same operation
+            lc = self.compare(self.lhs, guard.lhs)
+            rc = self.compare(self.rhs, guard.rhs)
+            print "compare", self.lhs, guard.lhs, lc
+            print "compare", self.rhs, guard.rhs, rc
+            opnum = my_key[1]
+            # x < y  = -1,-2,...
+            # x == y = 0
+            # x > y  = 1,2,...
+            if opnum == rop.INT_LT:
+                return (lc > 0 and rc >= 0) or (lc == 0 and rc >= 0)
+            if opnum == rop.INT_LE:
+                return (lc >= 0 and rc >= 0) or (lc == 0 and rc >= 0)
+            if opnum == rop.INT_GT:
+                return (lc < 0 and rc >= 0) or (lc == 0 and rc > 0)
+            if opnum == rop.INT_GE:
+                return (lc <= 0 and rc >= 0) or (lc == 0 and rc >= 0)
+        return False
+
+    def compare(self, key1, key2):
+        if isinstance(key1, Box):
+            assert isinstance(key2, Box)
+            assert key1 is key2 # key of hash enforces this
+            return 0
+        #
+        if isinstance(key1, ConstInt):
+            assert isinstance(key2, ConstInt)
+            v1 = key1.value
+            v2 = key2.value
+            if v1 == v2:
+                return 0
+            elif v1 < v2:
+                return -1
+            else:
+                return 1
+        #
+        if isinstance(key1, IndexVar):
+            assert isinstance(key2, IndexVar)
+            return key1.compare(key2)
+        #
+        raise RuntimeError("cannot compare: " + str(key1) + " <=> " + str(key2))
+
+    def emit_varops(self, opt, var):
+        if isinstance(var, IndexVar):
+            box = var.emit_operations(opt)
+            opt._same_as[var] = box
+            return box
+        else:
+            return var
+
+    def emit_operations(self, opt):
+        lhs, opnum, rhs = opt._get_key(self.cmp_op)
+        # create trace instructions for the index
+        box_lhs = self.emit_varops(opt, self.lhs)
+        box_rhs = self.emit_varops(opt, self.rhs)
+        box_result = self.cmp_op.result.clonebox()
+        opt.emit_operation(ResOperation(opnum, [box_lhs, box_rhs], box_result))
+        # guard
+        guard = self.op.clone()
+        guard.setarg(0, box_result)
+        opt.emit_operation(guard)
+
+class GuardStrengthenOpt(object):
+    def __init__(self, index_vars):
+        self.index_vars = index_vars
+        self._newoperations = []
+        self._same_as = {}
+
+    def find_compare_guard_bool(self, boolarg, operations, index):
+        i = index - 1
+        # most likely hit in the first iteration
+        while i > 0:
+            op = operations[i]
+            if op.result and op.result == boolarg:
+                return op
+            i -= 1
+
+        raise RuntimeError("guard_true/false first arg not defined")
+
+    def _get_key(self, cmp_op):
+        if cmp_op and rop.INT_LT <= cmp_op.getopnum() <= rop.INT_GE:
+            lhs_arg = cmp_op.getarg(0)
+            rhs_arg = cmp_op.getarg(1)
+            lhs_index_var = self.index_vars.get(lhs_arg, None)
+            rhs_index_var = self.index_vars.get(rhs_arg, None)
+
+            cmp_opnum = cmp_op.getopnum()
+            # get the key, this identifies the guarded operation
+            if lhs_index_var and rhs_index_var:
+                key = (lhs_index_var.getvariable(), cmp_opnum, rhs_index_var.getvariable())
+            elif lhs_index_var:
+                key = (lhs_index_var.getvariable(), cmp_opnum, rhs_arg)
+            elif rhs_index_var:
+                key = (lhs_arg, cmp_opnum, rhs_index_var)
+            else:
+                key = (lhs_arg, cmp_opnum, rhs_arg)
+            return key
+        return None
+
+
+    def get_key(self, guard_bool, operations, i):
+        cmp_op = self.find_compare_guard_bool(guard_bool.getarg(0), operations, i)
+        return self._get_key(cmp_op)
+
+    def propagate_all_forward(self, loop):
+        """ strengthens the guards that protect an integral value """
         strongest_guards = {}
-        strongest_guards_var = {}
-        index_vars = self.dependency_graph.index_vars
-        comparison_vars = self.dependency_graph.comparison_vars
-        operations = self.loop.operations
-        var_for_guard = {}
-        for i in range(len(operations)-1, -1, -1):
+        # index_vars = self.dependency_graph.index_vars
+        # comparison_vars = self.dependency_graph.comparison_vars
+        # the guards are ordered. guards[i] is before guards[j] iff i < j
+        operations = loop.operations
+        last_guard = None
+        for i,op in enumerate(operations):
             op = operations[i]
-            if op.is_guard():
-                for arg in op.getarglist():
-                    var_for_guard[arg] = True
-                    try:
-                        comparison = comparison_vars[arg]
-                        for index_var in list(comparison.getindex_vars()):
-                            if not index_var:
-                                continue
-                            var = index_var.getvariable()
-                            strongest_known = strongest_guards_var.get(var, None)
-                            if not strongest_known:
-                                strongest_guards_var[var] = index_var
-                                continue
-                            if index_var.less(strongest_known):
-                                strongest_guards_var[var] = strongest_known
-                                strongest_guards[op] = strongest_known
-                    except KeyError:
-                        pass
-
+            if op.is_guard() and op.getopnum() in (rop.GUARD_TRUE, rop.GUARD_FALSE):
+                cmp_op = self.find_compare_guard_bool(op.getarg(0), operations, i)
+                key = self._get_key(cmp_op)
+                if key:
+                    lhs_arg = cmp_op.getarg(0)
+                    lhs = self.index_vars.get(lhs_arg, lhs_arg)
+                    rhs_arg = cmp_op.getarg(1)
+                    rhs = self.index_vars.get(rhs_arg, rhs_arg)
+                    strongest = strongest_guards.get(key, None)
+                    if not strongest:
+                        strongest_guards[key] = Guard(op, cmp_op, lhs, rhs)
+                    else:
+                        guard = Guard(op, cmp_op, lhs, rhs)
+                        if guard.implies(strongest, self):
+                            guard.stronger = True
+                            strongest_guards[key] = guard
+        #
         last_op_idx = len(operations)-1
-        for op in operations:
-            if op.is_guard():
-                stronger_guard = strongest_guards.get(op, None)
-                if stronger_guard:
-                    # there is a stronger guard
+        for i,op in enumerate(operations):
+            op = operations[i]
+            if op.is_guard() and op.getopnum() in (rop.GUARD_TRUE, rop.GUARD_FALSE):
+                key = self.get_key(op, operations, i)
+                if key:
+                    strongest = strongest_guards.get(key, None)
+                    if not strongest or not strongest.stronger:
+                        # If the key is not None and there _must_ be a strongest
+                        # guard. If strongest is None, this operation implies the
+                        # strongest guard that has been already been emitted.
+                        self.emit_operation(op)
+                        continue
+                    elif strongest.emitted:
+                        continue
+                    strongest.emit_operations(self)
+                    strongest.emitted = True
                     continue
-                else:
-                    self.emit_operation(op)
+            if op.result:
+                # emit a same_as op if a box uses the same index variable
+                index_var = self.index_vars.get(op.result, None)
+                box = self._same_as.get(index_var, None)
+                if box:
+                    self.emit_operation(ResOperation(rop.SAME_AS, [box], op.result))
                     continue
-            if op.is_always_pure() and op.result:
-                try:
-                    var_index = index_vars[op.result]
-                    var_index.adapt_operation(op)
-                except KeyError:
-                    pass
             self.emit_operation(op)
 
-        self.loop.operations = self._newoperations[:]
+        loop.operations = self._newoperations[:]
+
+    def emit_operation(self, op):
+        self._newoperations.append(op)
+
 
 def must_unpack_result_to_exec(op, target_op):
     # TODO either move to resop or util