[pypy-commit] pypy vecopt-merge: forcing memo of variables while parsing to the same memo at runtime (only jitviewer and the test suite affected)

Wed Sep 16 18:06:59 CEST 2015

Author: Richard Plangger <planrichi at gmail.com>
Branch: vecopt-merge
Changeset: r79658:fe1eb22de735
Date: 2015-09-16 18:06 +0200
http://bitbucket.org/pypy/pypy/changeset/fe1eb22de735/

Log:	forcing memo of variables while parsing to the same memo at runtime
	(only jitviewer and the test suite affected) poking costmodel tests,
	accumulator in the algorithm missing

diff --git a/rpython/jit/metainterp/optimizeopt/dependency.py b/rpython/jit/metainterp/optimizeopt/dependency.py
--- a/rpython/jit/metainterp/optimizeopt/dependency.py
+++ b/rpython/jit/metainterp/optimizeopt/dependency.py
@@ -356,7 +356,7 @@
         pack = ''
         if self.pack:
             pack = "p: %d" % self.pack.numops()
-        return "Node(%s,%s i: %d)" % (self.op.getopname(), pack, self.opidx)
+        return "Node(%s,%s i: %d)" % (self.op, pack, self.opidx)
 
     def __ne__(self, other):
         return not self.__eq__(other)
diff --git a/rpython/jit/metainterp/optimizeopt/schedule.py b/rpython/jit/metainterp/optimizeopt/schedule.py
--- a/rpython/jit/metainterp/optimizeopt/schedule.py
+++ b/rpython/jit/metainterp/optimizeopt/schedule.py
@@ -7,6 +7,7 @@
 from rpython.jit.metainterp.optimizeopt.renamer import Renamer
 from rpython.rlib.objectmodel import we_are_translated
 from rpython.jit.metainterp.jitexc import NotAProfitableLoop
+from rpython.rlib.objectmodel import specialize
 
 
 class SchedulerState(object):
@@ -78,7 +79,7 @@
         state.renamer.rename(op)
         if unpack:
             state.ensure_args_unpacked(op)
-        node.position = len(state.oplist)
+        node.vector=Trueposition = len(state.oplist)
         worklist = state.worklist
         for dep in node.provides()[:]: # COPY
             to = dep.to
@@ -131,115 +132,6 @@
             for node in state.graph.nodes:
                 assert node.emitted
 
-#UNSIGNED_OPS = (rop.UINT_FLOORDIV, rop.UINT_RSHIFT,
-#                rop.UINT_LT, rop.UINT_LE,
-#                rop.UINT_GT, rop.UINT_GE)
-
-#class Type(object):
-#    """ The type of one operation. Saves type, size and sign. """
-#    @staticmethod
-#    def of(op):
-#        descr = op.getdescr()
-#        if descr:
-#            type = INT
-#            if descr.is_array_of_floats() or descr.concrete_type == FLOAT:
-#                type = FLOAT
-#            size = descr.get_item_size_in_bytes()
-#            sign = descr.is_item_signed()
-#            return Type(type, size, sign)
-#        else:
-#            size = 8
-#            sign = True
-#            if op.type == 'f' or op.getopnum() in UNSIGNED_OPS:
-#                sign = False
-#            return Type(op.type, size, sign)
-#
-#    def __init__(self, type, size, signed):
-#        assert type in (FLOAT, INT)
-#        self.type = type
-#        self.size = size
-#        self.signed = signed
-#
-#    def bytecount(self):
-#        return self.size
-#
-#    def clone(self):
-#        return Type(self.type, self.size, self.signed)
-#
-#    def __repr__(self):
-#        sign = '-'
-#        if not self.signed:
-#            sign = '+'
-#        return 'Type(%s%s, %d)' % (sign, self.type, self.size)
-#
-    #UNKNOWN_TYPE = '-'
-
-    #@staticmethod
-    #def of(box, count=-1):
-    #    assert box.type == 'V'
-    #    if count == -1:
-    #        count = box.getcount()
-    #    return Type(box.gettype(), box.getsize(), box.getsigned(), count)
-
-    #@staticmethod
-    #def by_descr(descr, vec_reg_size):
-    #    _t = INT
-    #    signed = descr.is_item_signed()
-    #    if descr.is_array_of_floats() or descr.concrete_type == FLOAT:
-    #        _t = FLOAT
-    #        signed = False
-    #    size = descr.get_item_size_in_bytes()
-    #    pt = Type(_t, size, signed, vec_reg_size // size)
-    #    return pt
-
-    #def clone(self):
-    #    return Type(self.type, self.size, self.signed, self.count)
-
-    #def new_vector_box(self, count = -1):
-    #    if count == -1:
-    #        count = self.count
-    #    assert count > 1
-    #    assert self.type in ('i','f')
-    #    assert self.size > 0
-    #    xxx
-    #    return BoxVector(self.type, count, self.size, self.signed)
-
-    #def combine(self, other):
-    #    """ nothing to be done here """
-    #    if not we_are_translated():
-    #        assert self.type == other.type
-    #        assert self.signed == other.signed
-
-
-    #def byte_size(self):
-    #    return self.count * self.size
-
-    #def setsize(self, size):
-    #    self.size = size
-
-    #def setcount(self, count):
-    #    self.count = count
-
-    #def gettype(self):
-    #    return self.type
-
-    #def getsize(self):
-    #    return self.size
-
-    #def getcount(self):
-    #    return self.count
-
-
-
-class TypeOutput(object):
-    def __init__(self, type, count):
-        self.type = type
-        self.count = count
-
-
-    def bytecount(self):
-        return self.count * self.type.bytecount()
-
 class TypeRestrict(object):
     ANY_TYPE = -1
     ANY_SIZE = -1
@@ -273,13 +165,6 @@
     TR_LONG = TypeRestrict(INT, 8, 2)
     TR_INT_2 = TypeRestrict(INT, 4, 2)
 
-    #INT = OpToVectorOp((TR_ANY_INTEGER, TR_ANY_INTEGER), DT_PASS)
-    #FLOAT = OpToVectorOp((TR_ANY_FLOAT, TR_ANY_FLOAT), DT_PASS)
-    #FLOAT_UNARY = OpToVectorOp((TR_ANY_FLOAT,), DT_PASS)
-    #LOAD = LoadToVectorLoad()
-    #STORE = StoreToVectorStore()
-    #GUARD = PassThroughOp((TR_ANY_INTEGER,))
-
     # note that the following definition is x86 arch specific
     MAPPING = {
         rop.VEC_INT_ADD:            [TR_ANY_INTEGER, TR_ANY_INTEGER],
@@ -318,11 +203,6 @@
         rop.VEC_INT_IS_TRUE:        [TR_ANY_INTEGER,TR_ANY_INTEGER],
     }
 
-    # TODO?
-    UNSIGNED_OPS = (rop.UINT_FLOORDIV, rop.UINT_RSHIFT,
-                    rop.UINT_LT, rop.UINT_LE,
-                    rop.UINT_GT, rop.UINT_GE)
-
 def turn_into_vector(state, pack):
     """ Turn a pack into a vector instruction """
     #
@@ -412,39 +292,6 @@
         #   self.input_type.getsize() != vecop.getsize():
         #    vecop = self.extend(vecop, self.input_type)
 
-        # use the input as an indicator for the pack type
-        #packable = vecop.maximum_numops()
-        #packed = vecop.count
-        #assert packed >= 0
-        #assert packable >= 0
-        #if packed > packable:
-        #    # the argument has more items than the operation is able to process!
-        #    # pos == 0 then it is already at the right place
-        #    if pos != 0:
-        #        args[i] = self.unpack(vecop, pos, packed - pos, self.input_type)
-        #        state.remember_args_in_vector(i, args[i])
-        #        #self.update_input_output(self.pack)
-        #        continue
-        #    else:
-        #        assert vecop is not None
-        #        args[i] = vecop
-        #        continue
-        #vboxes = self.vector_boxes_for_args(i)
-        #if packed < packable and len(vboxes) > 1:
-        #    # the argument is scattered along different vector boxes
-        #    args[i] = self.gather(vboxes, packable)
-        #    state.remember_args_in_vector(i, args[i])
-        #    continue
-        #if pos != 0:
-        #    # The vector box is at a position != 0 but it
-        #    # is required to be at position 0. Unpack it!
-        #    args[i] = self.unpack(vecop, pos, packed - pos, self.input_type)
-        #    state.remember_args_in_vector(i, args[i])
-        #    continue
-        ##
-        #assert vecop is not None
-        #args[i] = vecop
-
 def check_if_pack_supported(self, pack):
     op0 = pack.operations[0].getoperation()
     if self.input_type is None:
@@ -461,25 +308,6 @@
             # see assembler for comment why
             raise NotAProfitableLoop
 
-def extend(self, vbox, newtype):
-    assert vbox.gettype() == newtype.gettype()
-    if vbox.gettype() == INT:
-        return self.extend_int(vbox, newtype)
-    else:
-        raise NotImplementedError("cannot yet extend float")
-
-def extend_int(self, vbox, newtype):
-    vbox_cloned = newtype.new_vector_box(vbox.getcount())
-    self.sched_data._prevent_signext(newtype.getsize(), vbox.getsize())
-    newsize = newtype.getsize()
-    assert newsize > 0
-    op = ResOperation(rop.VEC_INT_SIGNEXT, 
-                      [vbox, ConstInt(newsize)],
-                      vbox_cloned)
-    self.costmodel.record_cast_int(vbox.getsize(), newtype.getsize(), vbox.getcount())
-    self.vecops.append(op)
-    return vbox_cloned
-
 def unpack_from_vector(state, arg, index, count):
     """ Extract parts of the vector box into another vector box """
     print "unpack i", index, "c", count, "v", arg
@@ -556,9 +384,6 @@
         if variables is not None:
             variables.append(vecop)
         state.expand([arg], vecop)
-        #expanded_map.setdefault(arg,[]).append((vecop, -1))
-        #for i in range(vecop.count):
-        #    state.setvector_of_box(arg, i, vecop)
         args[index] = vecop
         return vecop
 
@@ -642,7 +467,7 @@
         SchedulerState.post_schedule(self)
 
         # add accumulation info to the descriptor
-        #for version in self.loop.versions:
+        # TODO for version in self.loop.versions:
         #    # this needs to be done for renamed (accum arguments)
         #    version.renamed_inputargs = [ renamer.rename_map.get(arg,arg) for arg in version.inputargs ]
         #self.appended_arg_count = len(sched_data.invariant_vector_vars)
@@ -717,7 +542,7 @@
                 if argument and not argument.is_constant():
                     arg = self.ensure_unpacked(i, argument)
                     if argument is not arg:
-                        fail_arguments[i] = arg
+                        fail_args[i] = arg
 
     def ensure_unpacked(self, index, arg):
         if arg in self.seen or arg.is_vector():
@@ -756,9 +581,8 @@
                 break
             self.setvector_of_box(arg, i, box)
 
-
 def opcount_filling_vector_register(pack, vec_reg_size):
-    """ how many operations of that kind can one execute
+    """ How many operations of that kind can one execute
         with a machine instruction of register size X?
     """
     op = pack.leftmost()
@@ -790,10 +614,16 @@
     def numops(self):
         return len(self.operations)
 
-    def leftmost(self):
+    @specialize.arg(1)
+    def leftmost(self, node=False):
+        if node:
+            return self.operations[0]
         return self.operations[0].getoperation()
 
-    def rightmost(self):
+    @specialize.arg(1)
+    def rightmost(self, node=False):
+        if node:
+            return self.operations[-1]
         return self.operations[-1].getoperation()
 
     def pack_type(self):
@@ -933,7 +763,7 @@
     def __repr__(self):
         if len(self.operations) == 0:
             return "Pack(empty)"
-        return "Pack(%dx %s)" % (self.numops(), self.operations[0])
+        return "Pack(%dx %s)" % (self.numops(), self.operations)
 
     def is_accumulating(self):
         return self.accum is not None
@@ -943,14 +773,11 @@
         cloned.accum = self.accum
         return cloned
 
-
 class Pair(Pack):
     """ A special Pack object with only two statements. """
     def __init__(self, left, right):
         assert isinstance(left, Node)
         assert isinstance(right, Node)
-        self.left = left
-        self.right = right
         Pack.__init__(self, [left, right])
 
     def __eq__(self, other):
@@ -960,246 +787,28 @@
 
 class AccumPair(Pair):
     """ A pair that keeps track of an accumulation value """
-    def __init__(self, left, right, input_type, output_type, accum):
+    def __init__(self, left, right, accum):
         assert isinstance(left, Node)
         assert isinstance(right, Node)
-        Pair.__init__(self, left, right, input_type, output_type)
-        self.left = left
-        self.right = right
+        Pair.__init__(self, left, right)
         self.accum = accum
 
-#class OpToVectorOp(object):
-#    def __init__(self): #, restrictargs, typeoutput):
-#        pass
-#        #self.args = list(restrictargs) # do not use a tuple. rpython cannot union
-#        #self.out = typeoutput
+#def extend(self, vbox, newtype):
+#    assert vbox.gettype() == newtype.gettype()
+#    if vbox.gettype() == INT:
+#        return self.extend_int(vbox, newtype)
+#    else:
+#        raise NotImplementedError("cannot yet extend float")
 #
-#class OpToVectorOpConv(OpToVectorOp):
-#    def __init__(self, intype, outtype):
-#        #self.from_size = intype.getsize()
-#        #self.to_size = outtype.getsize()
-#        #OpToVectorOp.__init__(self, (intype, ), outtype)
-#        pass
-#
-#    def new_result_vector_box(self):
-#        type = self.output_type.gettype()
-#        size = self.to_size
-#        count = self.output_type.getcount()
-#        vec_reg_size = self.sched_data.vec_reg_size
-#        if count * size > vec_reg_size:
-#            count = vec_reg_size // size
-#        signed = self.output_type.signed
-#        assert type in ('i','f')
-#        assert size > 0
-#        assert count > 1
-#        return BoxVector(type, count, size, signed)
-#
-#    def get_output_type_given(self, input_type, op):
-#        return self.result_ptype
-#
-#    def get_input_type_given(self, output_type, op):
-#        return self.arg_ptypes[0]
-#
-#    def force_input(self, ptype):
-#        return self.arg_ptypes[0]
-#
-#class SignExtToVectorOp(OpToVectorOp):
-#    def __init__(self, intype, outtype):
-#        OpToVectorOp.__init__(self, intype, outtype)
-#        self.size = -1
-#
-#    def before_argument_transform(self, args):
-#        sizearg = args[1]
-#        assert isinstance(sizearg, ConstInt)
-#        self.size = sizearg.value
-#
-#    def new_result_vector_box(self):
-#        type = self.output_type.gettype()
-#        count = self.input_type.getcount()
-#        vec_reg_size = self.sched_data.vec_reg_size
-#        if count * self.size > vec_reg_size:
-#            count = vec_reg_size // self.size
-#        signed = self.input_type.signed
-#        assert type in ('i','f')
-#        assert self.size > 0
-#        assert count > 1
-#        return BoxVector(type, count, self.size, signed)
-#
-#    def get_output_type_given(self, input_type, op):
-#        sizearg = op.getarg(1)
-#        assert isinstance(sizearg, ConstInt)
-#        output_type = input_type.clone()
-#        output_type.setsize(sizearg.value)
-#        return output_type
-#
-#    def get_input_type_given(self, output_type, op):
-#        raise AssertionError("can never infer input type!")
-#
-#class LoadToVectorLoad(OpToVectorOp):
-#    def __init__(self):
-#        OpToVectorOp.__init__(self, (), TypeRestrict())
-#
-#    # OLD def before_argument_transform(self, args):
-#        #count = min(self.output_type.getcount(), len(self.getoperations()))
-#        #args.append(ConstInt(count))
-#
-#    def get_output_type_given(self, input_type, op):
-#        return xxx#Type.by_descr(op.getdescr(), self.sched_data.vec_reg_size)
-#
-#    def get_input_type_given(self, output_type, op):
-#        return None
-#
-#class StoreToVectorStore(OpToVectorOp):
-#    """ Storing operations are special because they are not allowed
-#        to store to memory if the vector is not fully filled.
-#        Thus a modified split_pack function.
-#    """
-#    def __init__(self):
-#        OpToVectorOp.__init__(self, (None, None, TypeRestrict()), None)
-#        self.has_descr = True
-#
-#    def must_be_full_but_is_not(self, pack):
-#        vrs = self.sched_data.vec_reg_size
-#        it = pack.input_type
-#        return it.getsize() * it.getcount() < vrs
-#
-#    def get_output_type_given(self, input_type, op):
-#        return None
-#
-#    def get_input_type_given(self, output_type, op):
-#        return xxx#Type.by_descr(op.getdescr(), self.sched_data.vec_reg_size)
-#
-#class PassThroughOp(OpToVectorOp):
-#    """ This pass through is only applicable if the target
-#        operation is capable of handling vector operations.
-#        Guard true/false is such an example.
-#    """
-#    def __init__(self, args):
-#        OpToVectorOp.__init__(self, args, None)
-#
-#    def get_output_type_given(self, input_type, op):
-#        return None
-#
-#    def get_input_type_given(self, output_type, op):
-#        raise AssertionError("cannot infer input type from output type")
-#
-#
-#
-##def determine_input_output_types(pack, node, forward):
-##    """ This function is two fold. If moving forward, it
-##        gets an input type from the packs output type and returns
-##        the transformed packtype.
-##
-##        Moving backward, the origins pack input type is the output
-##        type and the transformation of the packtype (in reverse direction)
-##        is the input
-##    """
-##    op = node.getoperation()
-##    op2vecop = determine_trans(op)
-##    if forward:
-##        input_type = op2vecop.force_input(pack.output_type)
-##        output_type = op2vecop.get_output_type_given(input_type, op)
-##        if output_type:
-##            output_type = output_type.clone()
-##    else:
-##        # going backwards, things are not that easy anymore
-##        output_type = pack.input_type
-##        input_type = op2vecop.get_input_type_given(output_type, op)
-##        if input_type:
-##            input_type = input_type.clone()
-##
-##    return input_type, output_type
-#
-#def determine_trans(op):
-#    op2vecop = trans.MAPPING.get(op.vector, None)
-#    if op2vecop is None:
-#        raise NotImplementedError("missing vecop for '%s'" % (op.getopname(),))
-#    return op2vecop
+#def extend_int(self, vbox, newtype):
+#    vbox_cloned = newtype.new_vector_box(vbox.getcount())
+#    self.sched_data._prevent_signext(newtype.getsize(), vbox.getsize())
+#    newsize = newtype.getsize()
+#    assert newsize > 0
+#    op = ResOperation(rop.VEC_INT_SIGNEXT, 
+#                      [vbox, ConstInt(newsize)],
+#                      vbox_cloned)
+#    self.costmodel.record_cast_int(vbox.getsize(), newtype.getsize(), vbox.getcount())
+#    self.vecops.append(op)
+#    return vbox_cloned
 
-
-#def before_argument_transform(self, args):
-#    pass
-
-#def transform_result(self, result):
-#    if result is None:
-#        return None
-#    vbox = self.new_result_vector_box()
-#    #
-#    # mark the position and the vbox in the hash
-#    for i, node in enumerate(self.getoperations()):
-#        if i >= vbox.getcount():
-#            break
-#        op = node.getoperation()
-#        self.sched_data.setvector_of_box(op, i, vbox)
-#    return vbox
-
-#def new_result_vector_box(self):
-#    type = self.output_type.gettype()
-#    size = self.output_type.getsize()
-#    count = min(self.output_type.getcount(), len(self.pack.operations))
-#    signed = self.output_type.signed
-#    return BoxVector(type, count, size, signed)
-
-#def getoperations(self):
-#    return self.pack.operations
-
-#def transform_arguments(self, args):
-#    """ Transforming one argument to a vector box argument
-#        The following cases can occur:
-#        1) argument is present in the box_to_vbox map.
-#           a) vector can be reused immediatly (simple case)
-#           b) vector is to big
-#           c) vector is to small
-#        2) argument is not known to reside in a vector
-#           a) expand vars/consts before the label and add as argument
-#           b) expand vars created in the loop body
-#    """
-#    for i,arg in enumerate(args):
-#        if arg.returns_vector():
-#            continue
-#        if not self.is_vector_arg(i):
-#            continue
-#        box_pos, vbox = self.sched_data.getvector_of_box(arg)
-#        if not vbox:
-#            # constant/variable expand this box
-#            vbox = self.expand(arg, i)
-#            self.sched_data.setvector_of_box(arg, 0, vbox)
-#            box_pos = 0
-#        # convert size i64 -> i32, i32 -> i64, ...
-#        if self.input_type.getsize() > 0 and \
-#           self.input_type.getsize() != vbox.getsize():
-#            vbox = self.extend(vbox, self.input_type)
-
-#        # use the input as an indicator for the pack type
-#        packable = self.input_type.getcount()
-#        packed = vbox.getcount()
-#        assert packed >= 0
-#        assert packable >= 0
-#        if packed > packable:
-#            # the argument has more items than the operation is able to process!
-#            # box_pos == 0 then it is already at the right place
-#            if box_pos != 0:
-#                args[i] = self.unpack(vbox, box_pos, packed - box_pos, self.input_type)
-#                remember_args_in_vector(i, args[i])
-#                #self.update_input_output(self.pack)
-#                continue
-#            else:
-#                assert vbox is not None
-#                args[i] = vbox
-#                continue
-#        vboxes = self.vector_boxes_for_args(i)
-#        if packed < packable and len(vboxes) > 1:
-#            # the argument is scattered along different vector boxes
-#            args[i] = self.gather(vboxes, packable)
-#            remember_args_in_vector(i, args[i])
-#            continue
-#        if box_pos != 0:
-#            # The vector box is at a position != 0 but it
-#            # is required to be at position 0. Unpack it!
-#            args[i] = self.unpack(vbox, box_pos, packed - box_pos, self.input_type)
-#            remember_args_in_vector(i, args[i])
-#            continue
-#            #self.update_input_output(self.pack)
-#        #
-#        assert vbox is not None
-#        args[i] = vbox
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_costmodel.py b/rpython/jit/metainterp/optimizeopt/test/test_costmodel.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_costmodel.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_costmodel.py
@@ -2,14 +2,15 @@
 
 from rpython.jit.metainterp.history import TargetToken, JitCellToken, TreeLoop
 from rpython.jit.metainterp.optimizeopt.util import equaloplists
-from rpython.jit.metainterp.optimizeopt.vectorize import (VecScheduleData,
-        Pack, NotAProfitableLoop, VectorizingOptimizer)
+from rpython.jit.metainterp.optimizeopt.vector import (Pack, X86_CostModel,
+        NotAProfitableLoop, VectorizingOptimizer)
+from rpython.jit.metainterp.optimizeopt.schedule import VecScheduleState
 from rpython.jit.metainterp.optimizeopt.dependency import Node, DependencyGraph
 from rpython.jit.metainterp.optimizeopt.test.test_util import LLtypeMixin
 from rpython.jit.metainterp.optimizeopt.test.test_schedule import SchedulerBaseTest
-from rpython.jit.metainterp.optimizeopt.test.test_vectorize import (FakeMetaInterpStaticData,
+from rpython.jit.metainterp.optimizeopt.test.test_vecopt import (FakeMetaInterpStaticData,
         FakeJitDriverStaticData)
-from rpython.jit.metainterp.resoperation import rop, ResOperation
+from rpython.jit.metainterp.resoperation import rop, ResOperation, AbstractValue
 from rpython.jit.tool.oparser import parse as opparse
 from rpython.jit.tool.oparser_model import get_model
 
@@ -18,7 +19,7 @@
         self.index_var = iv
         self.array = array
 
-    def is_adjacent_to(self, other):
+    def is_adjacent_after(self, other):
         if self.array is not other.array:
             return False
         iv = self.index_var
@@ -28,36 +29,39 @@
         # i1 and i0 ...
         # but not i0, i2
         # ...
-        return abs(val) == 1
+        print iv, 'is after', ov, "?", val == 1
+        return val == 1
 
 class CostModelBaseTest(SchedulerBaseTest):
+
     def savings(self, loop):
         metainterp_sd = FakeMetaInterpStaticData(self.cpu)
         jitdriver_sd = FakeJitDriverStaticData()
-        opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, loop, [])
-        label_index = loop.find_first_index(rop.LABEL)
-        opt.orig_label_args = loop.operations[label_index].getarglist()[:]
+        opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, 0)
+        opt.orig_label_args = loop.label.getarglist()[:]
         graph = opt.dependency_graph = DependencyGraph(loop)
+        self.show_dot_graph(graph, 'costmodel')
         for k,m in graph.memory_refs.items():
             graph.memory_refs[k] = FakeMemoryRef(m.array, m.index_var)
-        opt.find_adjacent_memory_refs()
+        opt.find_adjacent_memory_refs(graph)
         opt.extend_packset()
         opt.combine_packset()
         for pack in opt.packset.packs:
             print "pack: \n   ",
             print '\n    '.join([str(op.getoperation()) for op in pack.operations])
             print
-        opt.costmodel.reset_savings()
-        opt.schedule(True)
-        return opt.costmodel.savings
+        costmodel = X86_CostModel(self.cpu, 0)
+        state = VecScheduleState(graph, opt.packset, self.cpu, costmodel)
+        opt.schedule(state)
+        return costmodel.savings
 
     def assert_operations_match(self, loop_a, loop_b):
         assert equaloplists(loop_a.operations, loop_b.operations)
 
     def test_load_2_unpack(self):
-        loop1 = self.parse("""
-        f10 = raw_load(p0, i0, descr=double)
-        f11 = raw_load(p0, i1, descr=double)
+        loop1 = self.parse_trace("""
+        f10 = raw_load_f(p0, i0, descr=double)
+        f11 = raw_load_f(p0, i1, descr=double)
         guard_true(i0) [f10]
         guard_true(i1) [f11]
         """)
@@ -68,11 +72,11 @@
         assert savings == -2
 
     def test_load_4_unpack(self):
-        loop1 = self.parse("""
-        i10 = raw_load(p0, i0, descr=float)
-        i11 = raw_load(p0, i1, descr=float)
-        i12 = raw_load(p0, i2, descr=float)
-        i13 = raw_load(p0, i3, descr=float)
+        loop1 = self.parse_trace("""
+        i10 = raw_load_i(p0, i0, descr=float)
+        i11 = raw_load_i(p0, i1, descr=float)
+        i12 = raw_load_i(p0, i2, descr=float)
+        i13 = raw_load_i(p0, i3, descr=float)
         guard_true(i0) [i10]
         guard_true(i1) [i11]
         guard_true(i2) [i12]
@@ -82,29 +86,29 @@
         assert savings == -1
 
     def test_load_2_unpack_1(self):
-        loop1 = self.parse("""
-        f10 = raw_load(p0, i0, descr=double)
-        f11 = raw_load(p0, i1, descr=double)
+        loop1 = self.parse_trace("""
+        f10 = raw_load_f(p0, i0, descr=double)
+        f11 = raw_load_f(p0, i1, descr=double)
         guard_true(i0) [f10]
         """)
         savings = self.savings(loop1)
         assert savings == 0
 
     def test_load_2_unpack_1_index1(self):
-        loop1 = self.parse("""
-        f10 = raw_load(p0, i0, descr=double)
-        f11 = raw_load(p0, i1, descr=double)
+        loop1 = self.parse_trace("""
+        f10 = raw_load_f(p0, i0, descr=double)
+        f11 = raw_load_f(p0, i1, descr=double)
         guard_true(i0) [f11]
         """)
         savings = self.savings(loop1)
         assert savings == -1
 
-    def test_load_arith(self):
-        loop1 = self.parse("""
-        i10 = raw_load(p0, i0, descr=int)
-        i11 = raw_load(p0, i1, descr=int)
-        i12 = raw_load(p0, i2, descr=int)
-        i13 = raw_load(p0, i3, descr=int)
+    def test_load_arith1(self):
+        loop1 = self.parse_trace("""
+        i10 = raw_load_i(p0, i0, descr=int)
+        i11 = raw_load_i(p0, i1, descr=int)
+        i12 = raw_load_i(p0, i2, descr=int)
+        i13 = raw_load_i(p0, i3, descr=int)
         i15 = int_add(i10, 1)
         i16 = int_add(i11, 1)
         i17 = int_add(i12, 1)
@@ -114,9 +118,9 @@
         assert savings == 6
 
     def test_load_arith_store(self):
-        loop1 = self.parse("""
-        f10 = raw_load(p0, i0, descr=double)
-        f11 = raw_load(p0, i1, descr=double)
+        loop1 = self.parse_trace("""
+        f10 = raw_load_f(p0, i0, descr=double)
+        f11 = raw_load_f(p0, i1, descr=double)
         i20 = cast_float_to_int(f10)
         i21 = cast_float_to_int(f11)
         i30 = int_signext(i20, 4)
@@ -128,9 +132,9 @@
         assert savings >= 0
 
     def test_sum(self):
-        loop1 = self.parse("""
-        f10 = raw_load(p0, i0, descr=double)
-        f11 = raw_load(p0, i1, descr=double)
+        loop1 = self.parse_trace("""
+        f10 = raw_load_f(p0, i0, descr=double)
+        f11 = raw_load_f(p0, i1, descr=double)
         f12 = float_add(f1, f10)
         f13 = float_add(f12, f11)
         """)
@@ -139,9 +143,9 @@
 
     @py.test.mark.parametrize("bytes,s", [(1,None),(2,None),(4,0),(8,0)])
     def test_sum_float_to_int(self, bytes, s):
-        loop1 = self.parse("""
-        f10 = raw_load(p0, i0, descr=double)
-        f11 = raw_load(p0, i1, descr=double)
+        loop1 = self.parse_trace("""
+        f10 = raw_load_f(p0, i0, descr=double)
+        f11 = raw_load_f(p0, i1, descr=double)
         i10 = cast_float_to_int(f10)
         i11 = cast_float_to_int(f11)
         i12 = int_signext(i10, {c})
@@ -166,20 +170,20 @@
                 py.test.fail("must not fail")
 
     def test_cast(self):
-        loop1 = self.parse("""
-        i100 = raw_load(p0, i1, descr=float)
-        i101 = raw_load(p0, i2, descr=float)
-        i102 = raw_load(p0, i3, descr=float)
-        i103 = raw_load(p0, i4, descr=float)
+        loop1 = self.parse_trace("""
+        i100 = raw_load_i(p0, i1, descr=float)
+        i101 = raw_load_i(p0, i2, descr=float)
+        i102 = raw_load_i(p0, i3, descr=float)
+        i103 = raw_load_i(p0, i4, descr=float)
         #
-        i104 = raw_load(p1, i1, descr=short)
-        i105 = raw_load(p1, i2, descr=short)
-        i106 = raw_load(p1, i3, descr=short)
-        i107 = raw_load(p1, i4, descr=short)
-        i108 = raw_load(p1, i5, descr=short)
-        i109 = raw_load(p1, i6, descr=short)
-        i110 = raw_load(p1, i7, descr=short)
-        i111 = raw_load(p1, i8, descr=short)
+        i104 = raw_load_i(p1, i1, descr=short)
+        i105 = raw_load_i(p1, i2, descr=short)
+        i106 = raw_load_i(p1, i3, descr=short)
+        i107 = raw_load_i(p1, i4, descr=short)
+        i108 = raw_load_i(p1, i5, descr=short)
+        i109 = raw_load_i(p1, i6, descr=short)
+        i110 = raw_load_i(p1, i7, descr=short)
+        i111 = raw_load_i(p1, i8, descr=short)
         #
         f100 = cast_int_to_float(i104)
         f101 = cast_int_to_float(i105)
@@ -192,7 +196,7 @@
         """)
         try:
             self.savings(loop1)
-            py.test.fail("must not profitable!")
+            py.test.fail("must not be profitable!")
         except NotAProfitableLoop:
             pass
 
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_dependency.py b/rpython/jit/metainterp/optimizeopt/test/test_dependency.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_dependency.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_dependency.py
@@ -6,6 +6,7 @@
 from rpython.jit.metainterp.history import TargetToken, JitCellToken, TreeLoop
 from rpython.jit.metainterp.optimizeopt.dependency import (DependencyGraph, Dependency,
         IndexVar, MemoryRef, Node)
+from rpython.jit.metainterp.compile import ResumeAtLoopHeaderDescr
 from rpython.jit.metainterp.optimizeopt.vector import VectorLoop
 from rpython.jit.metainterp.resoperation import rop, ResOperation
 from rpython.jit.backend.llgraph.runner import ArrayDescr
@@ -54,7 +55,7 @@
         loop.jump.setdescr(token)
         for op in loop.operations:
             if op.getopnum() == rop.GUARD_EARLY_EXIT and op.getdescr() is None:
-                op.setdescr(compile.ResumeAtLoopHeaderDescr())
+                op.setdescr(ResumeAtLoopHeaderDescr())
         return loop
 
     def assert_edges(self, graph, edge_list, exceptions):
diff --git a/rpython/jit/metainterp/optimizeopt/vector.py b/rpython/jit/metainterp/optimizeopt/vector.py
--- a/rpython/jit/metainterp/optimizeopt/vector.py
+++ b/rpython/jit/metainterp/optimizeopt/vector.py
@@ -29,6 +29,8 @@
 from rpython.rlib.debug import debug_print, debug_start, debug_stop
 from rpython.rlib.jit import Counters
 from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.jit.backend.llsupport.symbolic import (WORD as INT_WORD,
+        SIZEOF_FLOAT as FLOAT_WORD)
 
 class VectorLoop(object):
     def __init__(self, label, oplist, jump):
@@ -188,7 +190,7 @@
 
         # vectorize
         graph = DependencyGraph(loop)
-        self.find_adjacent_memory_refs()
+        self.find_adjacent_memory_refs(graph)
         self.extend_packset()
         self.combine_packset()
         # TODO move cost model to CPU
@@ -256,7 +258,7 @@
                 if op.getopnum() in prohibit_opnums:
                     continue # do not unroll this operation twice
                 copied_op = op.clone()
-                if copied_op.result is not None:
+                if not copied_op.returns_void():
                     # every result assigns a new box, thus creates an entry
                     # to the rename map.
                     new_assigned_box = copied_op.result.clonebox()
@@ -323,7 +325,7 @@
             They are represented as a linear combination: i*c/d + e, i is a variable,
             all others are integers that are calculated in reverse direction
         """
-        loop = self.loop
+        loop = graph.loop
         operations = loop.operations
 
         self.packset = PackSet(self.cpu.vector_register_size)
@@ -338,8 +340,10 @@
                 # exclue a_opidx == b_opidx only consider the ones
                 # that point forward:
                 if memref_a.is_adjacent_after(memref_b):
+                    print node_a.getindex(), "is after", node_b.getindex()
                     pair = self.packset.can_be_packed(node_a, node_b, None, False)
                     if pair:
+                        print "creating mem pair", pair
                         self.packset.add_pack(pair)
 
     def extend_packset(self):
@@ -348,26 +352,33 @@
         """
         pack_count = self.packset.pack_count()
         while True:
-            for pack in self.packset.packs:
+            i = 0
+            packs = self.packset.packs
+            while i < len(packs):
+                pack = packs[i]
                 self.follow_def_uses(pack)
+                i += 1
             if pack_count == self.packset.pack_count():
                 pack_count = self.packset.pack_count()
-                for pack in self.packset.packs:
+                i = 0
+                while i < len(packs):
+                    pack = packs[i]
                     self.follow_use_defs(pack)
+                    i += 1
                 if pack_count == self.packset.pack_count():
                     break
             pack_count = self.packset.pack_count()
 
     def follow_use_defs(self, pack):
         assert isinstance(pack, Pair)
-        for ldep in pack.left.depends():
-            for rdep in pack.right.depends():
+        for ldep in pack.leftmost(True).depends():
+            for rdep in pack.rightmost(True).depends():
                 lnode = ldep.to
                 rnode = rdep.to
-                # only valid if the result of the left is in args of pack left
-                result = lnode.getoperation().result
-                args = pack.left.getoperation().getarglist()
-                if result is None or result not in args:
+                # only valid if the left op is an argument of the pack's leftmost op
+                left = lnode.getoperation()
+                args = pack.leftmost().getarglist()
+                if left is None or left not in args:
                     continue
                 isomorph = isomorphic(lnode.getoperation(), rnode.getoperation())
                 if isomorph and lnode.is_before(rnode):
@@ -377,19 +388,25 @@
 
     def follow_def_uses(self, pack):
         assert isinstance(pack, Pair)
-        for ldep in pack.left.provides():
-            for rdep in pack.right.provides():
+        print "lprov", pack.leftmost(node=True).provides_count(),
+        print "rprov", pack.rightmost(node=True).provides_count()
+        for ldep in pack.leftmost(node=True).provides():
+            for rdep in pack.rightmost(node=True).provides():
                 lnode = ldep.to
                 rnode = rdep.to
-                result = pack.left.getoperation().result
+                print "trying", lnode.getindex(), rnode.getindex(), lnode, rnode
+                left = pack.leftmost()
                 args = lnode.getoperation().getarglist()
-                if result is None or result not in args:
+                if left is None or left not in args:
                     continue
                 isomorph = isomorphic(lnode.getoperation(), rnode.getoperation())
                 if isomorph and lnode.is_before(rnode):
                     pair = self.packset.can_be_packed(lnode, rnode, pack, True)
                     if pair:
+                        print "creating pair" , pair, pair.operations[0].op, pair.operations[1].op
                         self.packset.add_pack(pair)
+                    else:
+                        print "!!!creating pair" , lnode, rnode
 
     def combine_packset(self):
         """ Combination is done iterating the packs that have
@@ -404,7 +421,6 @@
         i = 0
         j = 0
         end_ij = len(self.packset.packs)
-        orphan = {}
         while True:
             len_before = len(self.packset.packs)
             i = 0
@@ -616,6 +632,7 @@
             cost, benefit_factor = self.cb_signext(pack)
         #
         self.savings += benefit_factor * times - cost
+        print "$$$ recording", benefit_factor, "*", times, "-", cost, "now:", self.savings
 
     def cb_signext(self, pack):
         left = pack.leftmost()
@@ -627,13 +644,16 @@
     def record_cast_int(self, fromsize, tosize, count):
         # for each move there is 1 instruction
         self.savings += -count
+        print "$$$ cast", -count, "now", self.savings
 
     def record_vector_pack(self, src, index, count):
         if src.datatype == FLOAT:
             if index == 1 and count == 1:
                 self.savings -= 2
+                print "$$$ vector pack -2 now:", self.savings
                 return
         self.savings -= count
+        print "$$$ vector pack ", count, "now", self.savings
 
     def record_vector_unpack(self, src, index, count):
         self.record_vector_pack(src, index, count)
@@ -680,6 +700,7 @@
                 if self.profitable_pack(lnode, rnode, origin_pack, forward):
                     return Pair(lnode, rnode)
             else:
+                print "dependent"
                 if self.contains_pair(lnode, rnode):
                     return None
                 if origin_pack is not None:
@@ -688,24 +709,18 @@
 
     def contains_pair(self, lnode, rnode):
         for pack in self.packs:
-            if pack.left is lnode or pack.right is rnode:
+            if pack.leftmost(node=True) is lnode or \
+               pack.rightmost(node=True) is rnode:
                 return True
         return False
 
     def profitable_pack(self, lnode, rnode, origin_pack, forward):
-        lpacknode = origin_pack.left
-        if self.prohibit_packing(origin_pack,
-                                 lpacknode.getoperation(),
-                                 lnode.getoperation(),
-                                 forward):
+        if self.prohibit_packing(origin_pack, origin_pack.leftmost(),
+                                 lnode.getoperation(), forward):
             return False
-        rpacknode = origin_pack.right
-        if self.prohibit_packing(origin_pack,
-                                 rpacknode.getoperation(),
-                                 rnode.getoperation(),
-                                 forward):
+        if self.prohibit_packing(origin_pack, origin_pack.rightmost(),
+                                 rnode.getoperation(), forward):
             return False
-
         return True
 
     def prohibit_packing(self, pack, packed, inquestion, forward):
@@ -713,7 +728,7 @@
         if inquestion.vector == -1:
             return True
         if packed.is_primitive_array_access():
-            if packed.getarg(1) == inquestion.result:
+            if packed.getarg(1) is inquestion:
                 return True
         if not forward and inquestion.getopnum() == rop.INT_SIGNEXT:
             # prohibit the packing of signext in backwards direction
@@ -742,37 +757,37 @@
     def accumulates_pair(self, lnode, rnode, origin_pack):
         # lnode and rnode are isomorphic and dependent
         assert isinstance(origin_pack, Pair)
-        lop = lnode.getoperation()
-        opnum = lop.getopnum()
+        left = lnode.getoperation()
+        opnum = left.getopnum()
 
         if opnum in (rop.FLOAT_ADD, rop.INT_ADD, rop.FLOAT_MUL):
-            roper = rnode.getoperation()
-            assert lop.numargs() == 2 and lop.result is not None
-            accum_var, accum_pos = self.getaccumulator_variable(lop, roper, origin_pack)
+            right = rnode.getoperation()
+            assert left.numargs() == 2 and not left.returns_void()
+            accum_var, accum_pos = self.getaccumulator_variable(left, right, origin_pack)
             if not accum_var:
                 return None
-            # the dependency exists only because of the result of lnode
+            # the lnode -> rnode dependency must exist only because of accum_var
             for dep in lnode.provides():
                 if dep.to is rnode:
                     if not dep.because_of(accum_var):
                         # not quite ... this is not handlable
                         return None
             # get the original variable
-            accum_var = lop.getarg(accum_pos)
+            accum_var = left.getarg(accum_pos)
 
             # in either of the two cases the arguments are mixed,
             # which is not handled currently
             var_pos = (accum_pos + 1) % 2
-            plop = origin_pack.left.getoperation()
-            if lop.getarg(var_pos) is not plop.result:
+            if left.getarg(var_pos) is not origin_pack.leftmost():
                 return None
-            prop = origin_pack.right.getoperation()
-            if roper.getarg(var_pos) is not prop.result:
+            if right.getarg(var_pos) is not origin_pack.rightmost():
                 return None
 
             # this can be handled by accumulation
-            ptype = origin_pack.output_type
-            if ptype.getsize() != 8:
+            size = INT_WORD
+            if left.type == 'f':
+                size = FLOAT_WORD
+            if left.bytesize == right.bytesize and left.bytesize == size:
                 # do not support if if the type size is smaller
                 # than the cpu word size.
                 # WHY?
@@ -781,16 +796,14 @@
                 # considered. => tree pattern matching problem.
                 return None
             accum = Accum(opnum, accum_var, accum_pos)
-            return AccumPair(lnode, rnode, ptype, ptype, accum)
+            return AccumPair(lnode, rnode, accum)
 
         return None
 
-    def getaccumulator_variable(self, lop, rop, origin_pack):
-        args = rop.getarglist()
-        for i, arg in enumerate(args):
-            if arg is lop.result:
+    def getaccumulator_variable(self, left, right, origin_pack):
+        for i, arg in enumerate(right.getarglist()):
+            if arg is left:
                 return arg, i
-        #
         return None, -1
 
     def accumulate_prepare(self, state):
diff --git a/rpython/jit/metainterp/resoperation.py b/rpython/jit/metainterp/resoperation.py
--- a/rpython/jit/metainterp/resoperation.py
+++ b/rpython/jit/metainterp/resoperation.py
@@ -147,6 +147,11 @@
                 i += 1
                 arg = self.getarg(i)
             if arg.is_constant():
+                if arg.type == 'i':
+                    self.setdatatype('i', INT_WORD, True)
+                else:
+                    assert arg.type == 'f'
+                    self.setdatatype('f', FLOAT_WORD, False)
                 return
             self.setdatatype(arg.datatype, arg.bytesize, arg.signed)
         assert self.datatype != '\x00'
diff --git a/rpython/jit/tool/oparser.py b/rpython/jit/tool/oparser.py
--- a/rpython/jit/tool/oparser.py
+++ b/rpython/jit/tool/oparser.py
@@ -198,6 +198,8 @@
             from rpython.rtyper.lltypesystem import lltype, llmemory
             assert elem.startswith('p')
             v = InputArgRef(lltype.nullptr(llmemory.GCREF.TO))
+        # ensure that the variable gets the proper naming
+        self.update_memo(v, elem)
         self.vars[elem] = v
         return v
 
@@ -353,9 +355,24 @@
             raise ParseError("Double assign to var %s in line: %s" % (res, line))
         resop = self.create_op(opnum, args, res, descr, fail_args)
         res = self.update_vector(resop, res)
+        self.update_memo(resop, res)
         self.vars[res] = resop
         return resop
 
+    def update_memo(self, val, name):
+        """ Record the number embedded in `name` (42 for 'i42') as the repr
+            id of `val` (an operation or inputarg), so that it prints under
+            the name used in the parsed trace; other names are ignored.
+        """
+        match = re.match(r"[prif](\d+)", name)
+        if match:
+            counter = int(match.group(1))
+            memo = val._repr_memo
+            memo._d[val] = counter
+            # only ever raise the memo's running counter, never lower it
+            if memo.counter < counter:
+                memo.counter = counter
+
     def update_vector(self, resop, var):
         pattern = re.compile('.*\[(\d+)x(u?)(i|f)(\d+)\]')
         match = pattern.match(var)