[pypy-commit] pypy vecopt2: starting to modify the dependency construction to track all integral modifications

Tue May 5 09:45:57 CEST 2015

Author: Richard Plangger <rich at pasra.at>
Branch: vecopt2
Changeset: r77106:24298bf280c1
Date: 2015-04-08 16:36 +0200
http://bitbucket.org/pypy/pypy/changeset/24298bf280c1/

Log:	starting to modify the dependency construction to track all integral
	modifications

diff --git a/rpython/jit/metainterp/optimizeopt/dependency.py b/rpython/jit/metainterp/optimizeopt/dependency.py
--- a/rpython/jit/metainterp/optimizeopt/dependency.py
+++ b/rpython/jit/metainterp/optimizeopt/dependency.py
@@ -53,6 +53,10 @@
         else:
             self.defs[arg] = [(index,argcell)]
 
+    def redefintions(self, arg):
+        for _def in self.defs[arg]:
+            yield _def[0]
+
     def definition_index(self, arg, index = -1, argcell=None):
         def_chain = self.defs[arg]
         if len(def_chain) == 1:
@@ -103,6 +107,8 @@
         self.integral_mod = IntegralMod()
         self.schedulable_nodes = [0] # label is always scheduleable
         self.build_dependencies()
+        self.index_vars = {}
+        self.guards = []
 
     def build_dependencies(self):
         """ This is basically building the definition-use chain and saving this
@@ -114,7 +120,7 @@
         """
         tracker = DefTracker(self.memory_refs)
         #
-        guards = []
+        intformod = IntegralForwardModification(self.index_vars)
         # pass 1
         for i,op in enumerate(self.operations):
             # the label operation defines all operations at the
@@ -122,6 +128,9 @@
             if op.getopnum() == rop.LABEL:
                 for arg in op.getarglist():
                     tracker.define(arg, 0)
+                    if isinstance(arg, BoxInt):
+                        assert arg not in self.index_vars
+                        self.index_vars[arg] = IndexVar(arg)
                 continue # prevent adding edge to the label itself
             # definition of a new variable
             if op.result is not None:
@@ -133,21 +142,20 @@
                 for arg in op.getarglist():
                     self._def_use(arg, i, tracker)
             elif op.is_guard():
-                guards.append(i)
+                self.guards.append(i)
             else:
                 self._build_non_pure_dependencies(op, i, tracker)
-        #
+            intformod.inspect_operation(op, i)
         # pass 2 correct guard dependencies
-        for guard_idx in guards:
+        for guard_idx in self.guards:
             self._build_guard_dependencies(guard_idx, op.getopnum(), tracker)
         # pass 3 find schedulable nodes
         jump_pos = len(self.operations)-1
         for i,op in enumerate(self.operations):
             if len(self.adjacent_list[i]) == 0:
                 self.schedulable_nodes.append(i)
-            # every leaf instruction points to the jump_op. in theory
-            # every instruction points to jump_op, this is an optimization
-            # to prevent the scheduling of ops before the jump operation
+            # every leaf instruction points to the jump_op. in theory every instruction
+            # points to jump_op. this forces the jump/finish op to be the last operation
             if i != jump_pos:
                 for dep in self.adjacent_list[i]:
                     if dep.idx_to > i:
@@ -170,7 +178,6 @@
         for arg in guard_op.getarglist():
             self._def_use(arg, guard_idx, tracker)
 
-        print "guard[", guard_idx, "]", guard_op
         variables = []
         for dep in self.depends(guard_idx):
             idx = dep.idx_from
@@ -178,18 +185,16 @@
             for arg in op.getarglist():
                 if isinstance(arg, Box):
                     variables.append(arg)
-                    print " + in spe", arg
             if op.result:
                 variables.append(op.result)
-                print " + in spe res", op.result
         #
         for var in variables:
             try:
                 def_idx = tracker.definition_index(var)
                 for dep in self.provides(def_idx):
                     if var in dep.args and dep.idx_to > guard_idx:
-                        print "checking", var, "def at", def_idx, " -> ", dep
-                        print " ==> yes"
+                        #print "checking", var, "def at", def_idx, " -> ", dep
+                        #print " ==> yes"
                         self._put_edge(guard_idx, dep.idx_to, var)
             except KeyError:
                 pass
@@ -198,8 +203,9 @@
         if op.getfailargs():
             for arg in op.getfailargs():
                 try:
-                    def_idx = tracker.definition_index(arg)
-                    #self._put_edge(def_idx, guard_idx, arg)
+                    for def_idx in tracker.redefintions(arg):
+                        self._put_edge(def_idx, guard_idx, arg)
+                        #print "put arg", arg, ":", def_idx, guard_idx,"!!!"
                 except KeyError:
                     assert False
         #
@@ -300,6 +306,7 @@
         return args
 
     def _update_memory_ref(self, op, index, tracker):
+        # deprecated
         if index not in self.memory_refs:
             return
         memref = self.memory_refs[index]
@@ -327,9 +334,10 @@
         assert idx_from != idx_to
         dep = self.directly_depends(idx_from, idx_to)
         if not dep:
-            dep = Dependency(idx_from, idx_to, arg)
-            self.adjacent_list[idx_from].append(dep)
-            self.adjacent_list[idx_to].append(dep)
+            if self.independent(idx_from, idx_to):
+                dep = Dependency(idx_from, idx_to, arg)
+                self.adjacent_list[idx_from].append(dep)
+                self.adjacent_list[idx_to].append(dep)
         else:
             if arg not in dep.args:
                 dep.args.append(arg)
@@ -399,6 +407,7 @@
 
     def directly_depends(self, from_idx, to_idx):
         return self.instr_dependency(from_idx, to_idx)
+
     def instr_dependency(self, from_instr_idx, to_instr_idx):
         # XXX
         """ Does there exist a dependency from the instruction to another?
@@ -512,6 +521,83 @@
     def is_schedulable(self, idx):
         return self.graph.depends_count(idx) == 0
 
+class IntegralForwardModification(object):
+    """ Calculates integral modifications on an integer box. """
+    def __init__(self, index_vars):
+        self.index_vars = index_vars
+
+    def is_const_integral(self, box):
+        if isinstance(box, ConstInt):
+            return True
+        return False
+
+    additive_func_source = """
+    def operation_{name}(self, op, index):
+        box_r = op.result
+        if not box_r:
+            return
+        box_a0 = op.getarg(0)
+        box_a1 = op.getarg(1)
+        if self.is_const_integral(box_a0) and self.is_const_integral(box_a1):
+            idx_ref = IndexVar(box_r)
+            idx_ref.constant = box_a0.getint() {op} box_a1.getint())
+            self.index_vars[box_r] = idx_ref 
+        elif self.is_const_integral(box_a0):
+            idx_ref = self.index_vars[box_a0]
+            idx_ref = idx_ref.clone(box_r)
+            idx_ref.constant {op}= box_a0.getint()
+            self.index_vars[box_r] = idx_ref
+        elif self.is_const_integral(box_a1):
+            idx_ref = self.index_vars[box_a1]
+            idx_ref = idx_ref.clone(box_r)
+            idx_ref.constant {op}= box_a1.getint()
+            self.index_vars[box_r] = idx_ref
+    """
+    exec py.code.Source(additive_func_source.format(name='INT_ADD', 
+                                                    op='+')).compile()
+    exec py.code.Source(additive_func_source.format(name='INT_SUB', 
+                                                    op='-')).compile()
+    del additive_func_source
+
+    multiplicative_func_source = """
+    def operation_{name}(self, op):
+        box_r = op.result
+        if not box_r:
+            return
+        box_a0 = op.getarg(0)
+        box_a1 = op.getarg(1)
+        if self.is_const_integral(box_a0) and self.is_const_integral(box_a1):
+            idx_ref = IndexVar(box_r)
+            idx_ref.constant = box_a0.getint() {cop} box_a1.getint())
+            self.index_vars[box_r] = idx_ref 
+        elif self.is_const_integral(box_a0):
+            idx_ref = self.index_vars[box_a0]
+            idx_ref = idx_ref.clone(box_r)
+            self.coefficient_{tgt} *= box_a0.getint()
+            self.constant {cop}= box_a0.getint()
+            self.index_vars[box_r] = idx_ref
+        elif self.is_const_integral(box_a1):
+            idx_ref = self.index_vars[box_a1]
+            idx_ref = idx_ref.clone(box_r)
+            self.coefficient_{tgt} {op}= box_a1.getint()
+            self.constant {cop}= box_a1.getint()
+            self.index_vars[box_r] = idx_ref
+    """
+    exec py.code.Source(multiplicative_func_source.format(name='INT_MUL', 
+                                                 op='*', tgt='mul',
+                                                 cop='*')).compile()
+    exec py.code.Source(multiplicative_func_source.format(name='INT_FLOORDIV',
+                                                 op='*', tgt='div',
+                                                 cop='/')).compile()
+    exec py.code.Source(multiplicative_func_source.format(name='UINT_FLOORDIV',
+                                                 op='*', tgt='div',
+                                                 cop='/')).compile()
+    del multiplicative_func_source
+
+integral_dispatch_opt = make_dispatcher_method(IntegralForwardModification, 'operation_')
+IntegralForwardModification.inspect_operation = integral_dispatch_opt
+del integral_dispatch_opt
+
 class IntegralMod(object):
     """ Calculates integral modifications on an integer object.
     The operations must be provided in backwards direction and of one
@@ -532,11 +618,6 @@
     def _update_additive(self, i):
         return (i * self.coefficient_mul) / self.coefficient_div
 
-    def is_const_integral(self, box):
-        if isinstance(box, ConstInt):
-            return True
-        return False
-
     additive_func_source = """
     def operation_{name}(self, op):
         box_a0 = op.getarg(0)
@@ -592,6 +673,11 @@
                                                  cop='/')).compile()
     del multiplicative_func_source
 
+    def is_const_integral(self, box):
+        if isinstance(box, ConstInt):
+            return True
+        return False
+
     def update_memory_ref(self, memref):
         memref.constant = self.constant
         memref.coefficient_mul = self.coefficient_mul
@@ -605,6 +691,43 @@
 IntegralMod.inspect_operation = integral_dispatch_opt
 del integral_dispatch_opt
 
+class IndexVar(object):
+    def __init__(self, var):
+        self.var = var
+        self.coefficient_mul = 1
+        self.coefficient_div = 1
+        self.constant = 0
+
+    def __eq__(self, other):
+        if self.same_variable(other):
+            return self.diff(other) == 0
+        return False
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def clone(self, box):
+        c = IndexVar(box)
+        c.coefficient_mul = self.coefficient_mul
+        c.coefficient_div = self.coefficient_div
+        c.constant = self.constant
+        return c
+
+    def same_variable(self, other):
+        assert isinstance(other, IndexVar)
+        return other.var == self.var
+
+    def diff(self, other):
+        """ calculates the difference between this index variable and other """
+        assert isinstance(other, IndexVar)
+        mycoeff = self.coefficient_mul // self.coefficient_div
+        othercoeff = other.coefficient_mul // other.coefficient_div
+        return mycoeff + self.constant - (othercoeff + other.constant)
+
+    def __repr__(self):
+        return 'IndexVar(%s*(%s/%s)+%s)' % (self.var, self.coefficient_mul,
+                                            self.coefficient_div, self.constant)
+
 class MemoryRef(object):
     """ a memory reference to an array object. IntegralMod is able
     to propagate changes to this object if applied in backwards direction.
@@ -616,33 +739,37 @@
 
     will result in the linear combination i0 * (2/1) + 2
     """
-    def __init__(self, array, origin, descr):
+    def __init__(self, array, origin, descr, index_ref, byte_index=False):
         assert descr is not None
         self.array = array
-        self.origin = origin
         self.descr = descr
-        self.coefficient_mul = 1
-        self.coefficient_div = 1
-        self.constant = 0
+        self.index_ref = index_ref
+        self.byte_index = byte_index
 
     def is_adjacent_to(self, other):
         """ this is a symmetric relation """
-        match, off = self.calc_difference(other)
         stride = self.stride()
-        if match and stride != 0:
-            return abs(off) - stride == 0
+        if self.match(other):
+            return abs(self.index_ref.diff(other.index_ref)) - stride == 0
+        return False
+
+    def match(self, other):
+        assert isinstance(other, MemoryRef)
+        if self.array == other.array and self.descr == other.descr:
+            return self.index_ref.same_variable(other.index_ref):
         return False
 
     def stride(self):
         """ the stride in bytes """
+        if not self.byte_index:
+            return 1
         return self.descr.get_item_size_in_bytes()
 
     def is_adjacent_after(self, other):
         """ the asymetric relation to is_adjacent_to """
-        match, off = self.calc_difference(other)
         stride = self.stride()
-        if match and stride != 0:
-            return off == stride # must be equal to the positive stride
+        if self.match(other):
+            return self.index_ref.diff(other.index_ref) == stride
         return False
 
     def indices_can_alias(self, other):
@@ -650,35 +777,21 @@
         self.origin != other.origin, or their
         linear combination point to the same element.
         """
-        match, off = self.calc_difference(other)
-        if match:
-            return abs(off) < self.stride()
+        if self.index_ref.same_variable(other.index_ref):
+            return True
+        stride = self.stride()
+        if self.match(other):
+            return abs(self.index_ref.diff(other.index_ref)) < stride
         return False
 
     def __eq__(self, other):
-        match, off = self.calc_difference(other)
-        if match:
-            return off == 0
+        if self.match(other):
+            return self.index_ref.diff(other.index_ref) == 0
         return False
 
     def __ne__(self, other):
         return not self.__eq__(other)
 
-    def accesses_same_object(self, other):
-        assert isinstance(other, MemoryRef)
-        return self.array == other.array
-
-    def calc_difference(self, other):
-        """ calculates the difference in bytes as second return value """
-        assert isinstance(other, MemoryRef)
-        if self.array == other.array \
-            and self.origin == other.origin:
-            mycoeff = self.coefficient_mul // self.coefficient_div
-            othercoeff = other.coefficient_mul // other.coefficient_div
-            diff = other.constant - self.constant
-            return mycoeff == othercoeff, diff
-        return False, 0
-
     def __repr__(self):
-        return 'MemoryRef(%s*(%s/%s)+%s)' % (self.origin, self.coefficient_mul,
+        return 'MemRef(%s,%s*(%s/%s)+%s)' % (self.array, self.origin, self.coefficient_mul,
                                             self.coefficient_div, self.constant)
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
@@ -766,15 +766,25 @@
         self.assert_packset_empty(vopt.packset, len(loop.operations),
                                   [(5,11), (4,10), (6,12)])
 
-    @pytest.mark.parametrize("descr,stride",
-            [('char',1),('float',8),('int',8),('singlefloat',4)])
-    def test_packset_combine_simple(self,descr,stride):
+    @pytest.mark.parametrize("descr", ['char','float','int','singlefloat'])
+    def test_packset_combine_simple(self,descr):
         ops = """
         [p0,i0]
         i3 = getarrayitem_gc(p0, i0, descr={descr}arraydescr)
-        i1 = int_add(i0,{stride})
+        i1 = int_add(i0,1)
         jump(p0,i1)
-        """.format(descr=descr,stride=stride)
+        """.format(descr=descr)
+        loop = self.parse_loop(ops)
+        vopt = self.combine_packset(loop,3)
+        assert len(vopt.vec_info.memory_refs) == 4
+        assert len(vopt.packset.packs) == 1
+        self.assert_pack(vopt.packset.packs[0], (1,3,5,7))
+        ops = """
+        [p0,i0]
+        i3 = getarrayitem_raw(p0, i0, descr={descr}arraydescr)
+        i1 = int_add(i0,1)
+        jump(p0,i1)
+        """.format(descr=descr)
         loop = self.parse_loop(ops)
         vopt = self.combine_packset(loop,3)
         assert len(vopt.vec_info.memory_refs) == 4
@@ -786,15 +796,18 @@
     def test_packset_combine_2_loads_in_trace(self, descr, stride):
         ops = """
         [p0,i0]
-        i3 = getarrayitem_gc(p0, i0, descr={type}arraydescr)
+        i3 = raw_load(p0, i0, descr={type}arraydescr)
         i1 = int_add(i0,{stride})
-        i4 = getarrayitem_gc(p0, i1, descr={type}arraydescr)
+        i4 = raw_load(p0, i1, descr={type}arraydescr)
         i2 = int_add(i1,{stride})
         jump(p0,i2)
         """.format(type=descr,stride=stride)
         loop = self.parse_loop(ops)
         vopt = self.combine_packset(loop,3)
         assert len(vopt.vec_info.memory_refs) == 8
+        print "---"
+        for p in vopt.packset.packs:
+            print p
         assert len(vopt.packset.packs) == 1
         self.assert_pack(vopt.packset.packs[0], (1,3,5,7,9,11,13,15))
 
@@ -856,10 +869,10 @@
         i1 = int_add(i0, {stride})
         i10 = int_le(i1, 128)
         guard_true(i10) []
-        i2 = getarrayitem_gc(p0, i0, descr={descr}arraydescr)
-        i3 = getarrayitem_gc(p1, i0, descr={descr}arraydescr)
+        i2 = raw_load(p0, i0, descr={descr}arraydescr)
+        i3 = raw_load(p1, i0, descr={descr}arraydescr)
         i4 = {op}(i2,i3)
-        setarrayitem_gc(p2, i0, i4, descr={descr}arraydescr)
+        raw_store(p2, i0, i4, descr={descr}arraydescr)
         jump(p0,p1,p2,i1)
         """.format(op=op,descr=descr,stride=stride)
         loop = self.parse_loop(ops)
@@ -896,7 +909,7 @@
         setarrayitem_gc(p2, i0, i4, descr={descr}arraydescr) # 6, 13, 20, 27
         i1 = int_add(i0, {stride}) # 7, 14, 21, 28
         jump(p0,p1,p2,i1) # 29
-        """.format(op=op,descr=descr,stride=stride)
+        """.format(op=op,descr=descr,stride=1) # stride getarray is always 1
         vops = """
         [p0,p1,p2,i0]
         i10 = int_le(i0, 128)
@@ -910,7 +923,7 @@
         v3 = {op}(v1,v2)
         vec_raw_store(p2, i0, v3, 2, descr={descr}arraydescr)
         jump(p0,p1,p2,i12)
-        """.format(op='vec_'+op,descr=descr,stride=stride)
+        """.format(op='vec_'+op,descr=descr,stride=1)
         loop = self.parse_loop(ops)
         vopt = self.schedule(loop,1)
         self.debug_print_operations(vopt.loop)
@@ -959,19 +972,20 @@
         print "_--" * 10
         self.debug_print_operations(vopt.loop)
 
-    def test_vectorize_raw_load_add_index_item_byte_size(self):
+    def test_123(self):
         ops = """
-        [i0, i1, i2, i3, i4, i5, i6, i7]
-        i8 = raw_load(i3, i0, descr=intarraydescr)
-        i9 = raw_load(i4, i0, descr=intarraydescr)
-        i10 = int_add(i8, i9)
-        raw_store(i5, i0, i10, descr=intarraydescr)
-        i12 = int_add(i0, 8)
-        i14 = int_mul(i7, 8)
-        i15 = int_lt(i12, i14)
-        guard_true(i15) [i7, i10, i5, i4, i3, i9, i8, i12]
-        guard_future_condition() []
-        jump(i12, i8, i9, i3, i4, i5, i10, i7)
+        [i0, i1, i2, i3, i4]
+        debug_merge_point(0, 0, '1')
+        i6 = int_mul(i0, 8)
+        i7 = raw_load(i2, i6, descr=intarraydescr)
+        i8 = raw_load(i3, i6, descr=intarraydescr)
+        i9 = int_add(i7, i8)
+        raw_store(i4, i6, i9, descr=intarraydescr)
+        i11 = int_add(i0, 1)
+        i12 = int_lt(i11, i1)
+        guard_true(i12) [i4, i3, i2, i1, i11]
+        debug_merge_point(0, 0, '2')
+        label(i11, i1, i2, i3, i4)
         """
         vopt = self.schedule(self.parse_loop(ops),1)
         self.debug_print_operations(vopt.loop)
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -32,18 +32,15 @@
         else:
             print ""
 
-def must_unpack_result_to_exec(var, op):
+def must_unpack_result_to_exec(op, target_op):
     # TODO either move to resop or util
-    if op.vector == -1:
-        return True
-    if op.getopnum() == rop.RAW_LOAD or \
-       op.getopnum() == rop.GETARRAYITEM_GC or \
-       op.getopnum() == rop.GETARRAYITEM_RAW:
-        return True
-    if op.getopnum() == rop.RAW_STORE or \
-       op.getopnum() == rop.SETARRAYITEM_GC or \
-       op.getopnum() == rop.SETARRAYITEM_RAW:
-        if op.getarg(1) == var:
+    if op.vector != -1:
+        return False
+    return True
+
+def prohibit_packing(op1, op2):
+    if op2.is_array_op():
+        if op2.getarg(1) == op1.result:
             return True
     return False
 
@@ -147,9 +144,7 @@
                     try:
                         value = rename_map[arg]
                         copied_op.setarg(i, value)
-                        print "rename", arg, " to ", value
                     except KeyError:
-                        print "failing", arg, i
                         pass
                 # not only the arguments, but also the fail args need
                 # to be adjusted. rd_snapshot stores the live variables
@@ -191,6 +186,7 @@
                 print "box", box, "=>", value
             except KeyError:
                 print "FAIL:", i, box
+                pass
 
         snapshot = Snapshot(self.clone_snapshot(snapshot.prev, rename_map),
                             new_boxes)
@@ -235,9 +231,24 @@
         self.combine_packset()
         self.schedule()
 
+    def relax_guard_dependencies(self):
+        int_mod = IntegralMod()
+        for idx, guard in self.vec_info.guards.items():
+            int_mod.reset()
+            for dep in self.dependency_graph.depends(idx):
+                op = self.operations[dep.idx_from]
+                if op.returns_bool_result():
+                    for arg in op.getarglist():
+                        if isinstance(arg, Box):
+                            self._track_integral_modification(arg)
+
+    def _track_integral_modification(self, arg):
+        ref = MemoryRef(None, arg, None)
+
     def build_dependency_graph(self):
         self.dependency_graph = \
             DependencyGraph(self.loop.operations, self.vec_info.memory_refs)
+        self.relax_guard_dependencies()
 
     def find_adjacent_memory_refs(self):
         """ the pre pass already builds a hash of memory references and the
@@ -323,6 +334,8 @@
         end_ij = len(self.packset.packs)
         while True:
             len_before = len(self.packset.packs)
+            print "loop", len_before
+            i = 0
             while i < end_ij:
                 while j < end_ij and i < end_ij:
                     if i == j:
@@ -335,6 +348,7 @@
                     elif pack2.rightmost_match_leftmost(pack1):
                         end_ij = self.packset.combine(j,i)
                     j += 1
+                j = 0
                 i += 1
             if len_before == len(self.packset.packs):
                 break
@@ -500,18 +514,25 @@
         into account the benefit of executing this instruction
         as SIMD instruction.
         """
-        savings = -1 # 1 point for loading and 1 point for storing
+        savings = -1
 
-        # without loss of generatlity: only check the left side
+        # without loss of generality: only check 'left' operation
         lop = self.operations[lopidx]
         target_op = self.operations[pack.left.opidx]
 
+        if prohibit_packing(lop, target_op):
+            return -1
+
         if not expand_forward:
-            if not must_unpack_result_to_exec(lop.result, target_op):
+            print " backward savings", savings
+            if not must_unpack_result_to_exec(target_op, lop):
                 savings += 1
+            print " => backward savings", savings
         else:
-            if not must_unpack_result_to_exec(target_op.result, lop):
+            print " forward savings", savings
+            if not must_unpack_result_to_exec(target_op, lop):
                 savings += 1
+            print " => forward savings", savings
 
         return savings
 
@@ -595,15 +616,25 @@
     def __init__(self):
         self.smallest_type_bytes = 0
         self.memory_refs = {}
+        self.guards = {}
         self.track_memory_refs = False
         self.index = 0
 
+    guard_source = """
+    def operation_{name}(self, op):
+        if self.track_memory_refs:
+            self.guards[self.index] = op
+    """
+    for op in ['GUARD_TRUE','GUARD_FALSE']:
+            exec py.code.Source(guard_source.format(name=op)).compile()
+    del guard_source
+
     array_access_source = """
     def operation_{name}(self, op):
         descr = op.getdescr()
         if self.track_memory_refs:
             self.memory_refs[self.index] = \
-                    MemoryRef(op.getarg(0), op.getarg(1), op.getdescr())
+                MemoryRef(op.getarg(0), op.getarg(1), op.getdescr(), {elemidx})
         if not descr.is_array_of_pointers():
             byte_count = descr.get_item_size_in_bytes()
             if self.smallest_type_bytes == 0 \
@@ -611,17 +642,17 @@
                 self.smallest_type_bytes = byte_count
     """
     exec py.code.Source(array_access_source
-              .format(name='RAW_LOAD')).compile()
+              .format(name='RAW_LOAD',elemidx=True)).compile()
     exec py.code.Source(array_access_source
-              .format(name='RAW_STORE')).compile()
+              .format(name='RAW_STORE',elemidx=True)).compile()
     exec py.code.Source(array_access_source
-              .format(name='GETARRAYITEM_GC')).compile()
+              .format(name='GETARRAYITEM_GC',elemidx=False)).compile()
     exec py.code.Source(array_access_source
-              .format(name='SETARRAYITEM_GC')).compile()
+              .format(name='SETARRAYITEM_GC',elemidx=False)).compile()
     exec py.code.Source(array_access_source
-              .format(name='GETARRAYITEM_RAW')).compile()
+              .format(name='GETARRAYITEM_RAW',elemidx=False)).compile()
     exec py.code.Source(array_access_source
-              .format(name='SETARRAYITEM_RAW')).compile()
+              .format(name='SETARRAYITEM_RAW',elemidx=False)).compile()
     del array_access_source
 
     def default_operation(self, operation):
diff --git a/rpython/jit/metainterp/resoperation.py b/rpython/jit/metainterp/resoperation.py
--- a/rpython/jit/metainterp/resoperation.py
+++ b/rpython/jit/metainterp/resoperation.py
@@ -171,6 +171,11 @@
     def is_vector_arithmetic(self):
         return rop._VEC_ARITHMETIC_FIRST <= self.getopnum() <= rop._VEC_ARITHMETIC_LAST
 
+    def is_array_op(self):
+        on = self.getopnum()
+        return rop.GETARRAYITEM_GC <= on <= rop.VEC_RAW_LOAD or \
+               rop.SETARRAYITEM_GC <= on <= rop.VEC_RAW_STORE
+
     def is_comparison(self):
         return self.is_always_pure() and self.returns_bool_result()
 
@@ -500,9 +505,9 @@
 
     'GETARRAYITEM_GC/2d',
     'GETARRAYITEM_RAW/2d',
-    'GETINTERIORFIELD_GC/2d',
     'RAW_LOAD/2d',
     'VEC_RAW_LOAD/3d',
+    'GETINTERIORFIELD_GC/2d',
     'GETFIELD_GC/1d',
     'GETFIELD_RAW/1d',
     '_MALLOC_FIRST',
@@ -523,10 +528,10 @@
     'INCREMENT_DEBUG_COUNTER/1',
     'SETARRAYITEM_GC/3d',
     'SETARRAYITEM_RAW/3d',
+    'RAW_STORE/3d',
+    'VEC_RAW_STORE/4d',
     'SETINTERIORFIELD_GC/3d',
     'SETINTERIORFIELD_RAW/3d',    # right now, only used by tests
-    'RAW_STORE/3d',
-    'VEC_RAW_STORE/4d',
     'SETFIELD_GC/2d',
     'ZERO_PTR_FIELD/2', # only emitted by the rewrite, clears a pointer field
                         # at a given constant offset, no descr
diff --git a/rpython/jit/metainterp/test/test_vectorize.py b/rpython/jit/metainterp/test/test_vectorize.py
--- a/rpython/jit/metainterp/test/test_vectorize.py
+++ b/rpython/jit/metainterp/test/test_vectorize.py
@@ -8,7 +8,7 @@
 from rpython.rlib.jit import JitDriver, hint, set_param
 from rpython.rlib.objectmodel import compute_hash
 from rpython.rtyper.lltypesystem import lltype, rffi
-from rpython.rlib.rarithmetic import r_uint
+from rpython.rlib.rarithmetic import r_uint, intmask
 from rpython.rlib.rawstorage import (alloc_raw_storage, raw_storage_setitem,
                                      free_raw_storage, raw_storage_getitem)
 
@@ -24,7 +24,7 @@
     def test_vectorize_simple_load_arith_store_mul(self):
         myjitdriver = JitDriver(greens = [],
                                 reds = ['i','d','va','vb','vc'],
-                                vectorize=True)
+                                vectorize=False)
         def f(d):
             va = alloc_raw_storage(d*rffi.sizeof(rffi.SIGNED), zero=True)
             vb = alloc_raw_storage(d*rffi.sizeof(rffi.SIGNED), zero=True)
@@ -96,7 +96,7 @@
         self.check_trace_count(1)
 
     def test_guard(self):
-        pytest.skip()
+        py.test.skip('abc')
         myjitdriver = JitDriver(greens = [],
                                 reds = ['a','b','c'],
                                 vectorize=True)
@@ -117,5 +117,40 @@
         assert res == 42
         self.check_trace_count(1)
 
+    @py.test.mark.parametrize('i',[8])
+    def test_vectorize_array_get_set(self,i):
+        myjitdriver = JitDriver(greens = [],
+                                reds = ['i','d','va','vb','vc'],
+                                vectorize=True)
+        ET = rffi.SIGNED
+        T = lltype.Array(ET, hints={'nolength': True})
+        def f(d):
+            i = 0
+            va = lltype.malloc(T, d, flavor='raw', zero=True)
+            vb = lltype.malloc(T, d, flavor='raw', zero=True)
+            vc = lltype.malloc(T, d, flavor='raw', zero=True)
+            for j in range(d):
+                va[j] = j
+                vb[j] = j
+            while i < d:
+                myjitdriver.can_enter_jit(i=i, d=d, va=va, vb=vb, vc=vc)
+                myjitdriver.jit_merge_point(i=i, d=d, va=va, vb=vb, vc=vc)
+
+                a = va[i]
+                b = vb[i]
+                vc[i] = a+b
+
+                i += 1
+            res = 0
+            for j in range(d):
+                res += intmask(vc[j])
+            lltype.free(va, flavor='raw')
+            lltype.free(vb, flavor='raw')
+            lltype.free(vc, flavor='raw')
+            return res
+        res = self.meta_interp(f, [i])
+        assert res == f(i)
+        self.check_trace_count(1)
+
 class TestLLtype(VectorizeTest, LLJitMixin):
     pass