[pypy-commit] pypy ppc-vsx-support: copy paste vector_ext file to provide a skeleton to ppc,
plan_rich
pypy.commits at gmail.com
Tue Jun 14 05:42:38 EDT 2016
Author: Richard Plangger <planrichi at gmail.com>
Branch: ppc-vsx-support
Changeset: r85156:2cef040e9515
Date: 2016-06-13 17:26 +0200
http://bitbucket.org/pypy/pypy/changeset/2cef040e9515/
Log: copy paste vector_ext file to provide a skeleton to ppc, update vec
argument to be turned on by default now updated doc of the option
diff --git a/rpython/jit/backend/ppc/vector_ext.py b/rpython/jit/backend/ppc/vector_ext.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/ppc/vector_ext.py
@@ -0,0 +1,725 @@
+import py
+from rpython.jit.metainterp.compile import ResumeGuardDescr
+from rpython.jit.metainterp.history import (ConstInt, INT, REF,
+ FLOAT, VECTOR, TargetToken)
+from rpython.jit.backend.llsupport.descr import (ArrayDescr, CallDescr,
+ unpack_arraydescr, unpack_fielddescr, unpack_interiorfielddescr)
+from rpython.jit.backend.llsupport.regalloc import get_scale
+from rpython.jit.metainterp.resoperation import (rop, ResOperation,
+ VectorOp, VectorGuardOp)
+from rpython.rlib.objectmodel import we_are_translated
+from rpython.rtyper.lltypesystem.lloperation import llop
+from rpython.rtyper.lltypesystem import lltype
+
+def not_implemented(msg):
+ msg = '[ppc/vector_ext] %s\n' % msg
+ if we_are_translated():
+ llop.debug_print(lltype.Void, msg)
+ raise NotImplementedError(msg)
+
+class PPCVectorAssemblerMixin(object):
+ _mixin_ = True
+
+ #def genop_guard_vec_guard_true(self, guard_op, guard_token, locs, resloc):
+ # self.implement_guard(guard_token)
+
+ #def genop_guard_vec_guard_false(self, guard_op, guard_token, locs, resloc):
+ # self.guard_success_cc = rx86.invert_condition(self.guard_success_cc)
+ # self.implement_guard(guard_token)
+
+ #def guard_vector(self, guard_op, loc, true):
+ # assert isinstance(guard_op, VectorGuardOp)
+ # arg = guard_op.getarg(0)
+ # assert isinstance(arg, VectorOp)
+ # size = arg.bytesize
+ # temp = X86_64_XMM_SCRATCH_REG
+ # load = arg.bytesize * arg.count - self.cpu.vector_register_size
+ # assert load <= 0
+ # if true:
+ # self.mc.PXOR(temp, temp)
+ # # if the vector is not fully packed blend 1s
+ # if load < 0:
+ # self.mc.PCMPEQQ(temp, temp) # fill with ones
+ # self._blend_unused_slots(loc, arg, temp)
+ # # reset to zeros
+ # self.mc.PXOR(temp, temp)
+
+ # # cmp with zeros (in temp) creates ones at each slot where it is zero
+ # self.mc.PCMPEQ(loc, temp, size)
+ # # temp converted to ones
+ # self.mc.PCMPEQQ(temp, temp)
+ # # test if all slots are zero
+ # self.mc.PTEST(loc, temp)
+ # self.guard_success_cc = rx86.Conditions['Z']
+ # else:
+ # # if the vector is not fully packed blend 1s
+ # if load < 0:
+ # temp = X86_64_XMM_SCRATCH_REG
+ # self.mc.PXOR(temp, temp)
+ # self._blend_unused_slots(loc, arg, temp)
+ # self.mc.PTEST(loc, loc)
+ # self.guard_success_cc = rx86.Conditions['NZ']
+
+ #def _blend_unused_slots(self, loc, arg, temp):
+ # select = 0
+ # bits_used = (arg.count * arg.bytesize * 8)
+ # index = bits_used // 16
+ # while index < 8:
+ # select |= (1 << index)
+ # index += 1
+ # self.mc.PBLENDW_xxi(loc.value, temp.value, select)
+
+ #def _update_at_exit(self, fail_locs, fail_args, faildescr, regalloc):
+ # """ If accumulation is done in this loop, at the guard exit
+ # some vector registers must be adjusted to yield the correct value
+ # """
+ # if not isinstance(faildescr, ResumeGuardDescr):
+ # return
+ # assert regalloc is not None
+ # accum_info = faildescr.rd_vector_info
+ # while accum_info:
+ # pos = accum_info.getpos_in_failargs()
+ # scalar_loc = fail_locs[pos]
+ # vector_loc = accum_info.location
+ # # the upper elements will be lost if saved to the stack!
+ # scalar_arg = accum_info.getoriginal()
+ # assert isinstance(vector_loc, RegLoc)
+ # if not isinstance(scalar_loc, RegLoc):
+ # scalar_loc = regalloc.force_allocate_reg(scalar_arg)
+ # assert scalar_arg is not None
+ # if accum_info.accum_operation == '+':
+ # self._accum_reduce_sum(scalar_arg, vector_loc, scalar_loc)
+ # elif accum_info.accum_operation == '*':
+ # self._accum_reduce_mul(scalar_arg, vector_loc, scalar_loc)
+ # else:
+ # not_implemented("accum operator %s not implemented" %
+ # (accum_info.accum_operation))
+ # accum_info = accum_info.next()
+
+ #def _accum_reduce_mul(self, arg, accumloc, targetloc):
+ # scratchloc = X86_64_XMM_SCRATCH_REG
+ # self.mov(accumloc, scratchloc)
+ # # swap the two elements
+ # self.mc.SHUFPD_xxi(scratchloc.value, scratchloc.value, 0x01)
+ # self.mc.MULSD(accumloc, scratchloc)
+ # if accumloc is not targetloc:
+ # self.mov(accumloc, targetloc)
+
+ #def _accum_reduce_sum(self, arg, accumloc, targetloc):
+ # # Currently the accumulator can ONLY be the biggest
+ # # size for X86 -> 64 bit float/int
+ # if arg.type == FLOAT:
+ # # r = (r[0]+r[1],r[0]+r[1])
+ # self.mc.HADDPD(accumloc, accumloc)
+ # # upper bits (> 64) are dirty (but does not matter)
+ # if accumloc is not targetloc:
+ # self.mov(accumloc, targetloc)
+ # return
+ # elif arg.type == INT:
+ # scratchloc = X86_64_SCRATCH_REG
+ # self.mc.PEXTRQ_rxi(targetloc.value, accumloc.value, 0)
+ # self.mc.PEXTRQ_rxi(scratchloc.value, accumloc.value, 1)
+ # self.mc.ADD(targetloc, scratchloc)
+ # return
+
+ # not_implemented("reduce sum for %s not impl." % arg)
+
+ #def _genop_vec_getarrayitem(self, op, arglocs, resloc):
+ # # considers item scale (raw_load does not)
+ # base_loc, ofs_loc, size_loc, ofs, integer_loc, aligned_loc = arglocs
+ # scale = get_scale(size_loc.value)
+ # src_addr = addr_add(base_loc, ofs_loc, ofs.value, scale)
+ # self._vec_load(resloc, src_addr, integer_loc.value,
+ # size_loc.value, aligned_loc.value)
+ #
+ #genop_vec_getarrayitem_raw_i = _genop_vec_getarrayitem
+ #genop_vec_getarrayitem_raw_f = _genop_vec_getarrayitem
+ #
+ #genop_vec_getarrayitem_gc_i = _genop_vec_getarrayitem
+ #genop_vec_getarrayitem_gc_f = _genop_vec_getarrayitem
+
+ #def _genop_vec_raw_load(self, op, arglocs, resloc):
+ # base_loc, ofs_loc, size_loc, ofs, integer_loc, aligned_loc = arglocs
+ # src_addr = addr_add(base_loc, ofs_loc, ofs.value, 0)
+ # self._vec_load(resloc, src_addr, integer_loc.value,
+ # size_loc.value, aligned_loc.value)
+
+ #genop_vec_raw_load_i = _genop_vec_raw_load
+ #genop_vec_raw_load_f = _genop_vec_raw_load
+
+ #def _vec_load(self, resloc, src_addr, integer, itemsize, aligned):
+ # if integer:
+ # if aligned:
+ # self.mc.MOVDQA(resloc, src_addr)
+ # else:
+ # self.mc.MOVDQU(resloc, src_addr)
+ # else:
+ # if itemsize == 4:
+ # self.mc.MOVUPS(resloc, src_addr)
+ # elif itemsize == 8:
+ # self.mc.MOVUPD(resloc, src_addr)
+
+ #def _genop_discard_vec_setarrayitem(self, op, arglocs):
+ # # considers item scale (raw_store does not)
+ # base_loc, ofs_loc, value_loc, size_loc, baseofs, integer_loc, aligned_loc = arglocs
+ # scale = get_scale(size_loc.value)
+ # dest_loc = addr_add(base_loc, ofs_loc, baseofs.value, scale)
+ # self._vec_store(dest_loc, value_loc, integer_loc.value,
+ # size_loc.value, aligned_loc.value)
+
+ #genop_discard_vec_setarrayitem_raw = _genop_discard_vec_setarrayitem
+ #genop_discard_vec_setarrayitem_gc = _genop_discard_vec_setarrayitem
+
+ #def genop_discard_vec_raw_store(self, op, arglocs):
+ # base_loc, ofs_loc, value_loc, size_loc, baseofs, integer_loc, aligned_loc = arglocs
+ # dest_loc = addr_add(base_loc, ofs_loc, baseofs.value, 0)
+ # self._vec_store(dest_loc, value_loc, integer_loc.value,
+ # size_loc.value, aligned_loc.value)
+
+ #def _vec_store(self, dest_loc, value_loc, integer, itemsize, aligned):
+ # if integer:
+ # if aligned:
+ # self.mc.MOVDQA(dest_loc, value_loc)
+ # else:
+ # self.mc.MOVDQU(dest_loc, value_loc)
+ # else:
+ # if itemsize == 4:
+ # self.mc.MOVUPS(dest_loc, value_loc)
+ # elif itemsize == 8:
+ # self.mc.MOVUPD(dest_loc, value_loc)
+
+ #def genop_vec_int_is_true(self, op, arglocs, resloc):
+ # loc, sizeloc = arglocs
+ # temp = X86_64_XMM_SCRATCH_REG
+ # self.mc.PXOR(temp, temp)
+ # # every entry that is non zero -> becomes zero
+ # # zero entries become ones
+ # self.mc.PCMPEQ(loc, temp, sizeloc.value)
+ # # a second time -> every zero entry (corresponding to non zero
+ # # entries before) become ones
+ # self.mc.PCMPEQ(loc, temp, sizeloc.value)
+
+ #def genop_vec_int_mul(self, op, arglocs, resloc):
+ # loc0, loc1, itemsize_loc = arglocs
+ # itemsize = itemsize_loc.value
+ # if itemsize == 2:
+ # self.mc.PMULLW(loc0, loc1)
+ # elif itemsize == 4:
+ # self.mc.PMULLD(loc0, loc1)
+ # else:
+ # # NOTE see http://stackoverflow.com/questions/8866973/can-long-integer-routines-benefit-from-sse/8867025#8867025
+ # # There is no 64x64 bit packed mul. For 8 bit either. It is questionable if it gives any benefit?
+ # not_implemented("int8/64 mul")
+
+ #def genop_vec_int_add(self, op, arglocs, resloc):
+ # loc0, loc1, size_loc = arglocs
+ # size = size_loc.value
+ # if size == 1:
+ # self.mc.PADDB(loc0, loc1)
+ # elif size == 2:
+ # self.mc.PADDW(loc0, loc1)
+ # elif size == 4:
+ # self.mc.PADDD(loc0, loc1)
+ # elif size == 8:
+ # self.mc.PADDQ(loc0, loc1)
+
+ #def genop_vec_int_sub(self, op, arglocs, resloc):
+ # loc0, loc1, size_loc = arglocs
+ # size = size_loc.value
+ # if size == 1:
+ # self.mc.PSUBB(loc0, loc1)
+ # elif size == 2:
+ # self.mc.PSUBW(loc0, loc1)
+ # elif size == 4:
+ # self.mc.PSUBD(loc0, loc1)
+ # elif size == 8:
+ # self.mc.PSUBQ(loc0, loc1)
+
+ #def genop_vec_int_and(self, op, arglocs, resloc):
+ # self.mc.PAND(resloc, arglocs[0])
+
+ #def genop_vec_int_or(self, op, arglocs, resloc):
+ # self.mc.POR(resloc, arglocs[0])
+
+ #def genop_vec_int_xor(self, op, arglocs, resloc):
+ # self.mc.PXOR(resloc, arglocs[0])
+
+ #genop_vec_float_arith = """
+ #def genop_vec_float_{type}(self, op, arglocs, resloc):
+ # loc0, loc1, itemsize_loc = arglocs
+ # itemsize = itemsize_loc.value
+ # if itemsize == 4:
+ # self.mc.{p_op_s}(loc0, loc1)
+ # elif itemsize == 8:
+ # self.mc.{p_op_d}(loc0, loc1)
+ #"""
+ #for op in ['add','mul','sub']:
+ # OP = op.upper()
+ # _source = genop_vec_float_arith.format(type=op,
+ # p_op_s=OP+'PS',
+ # p_op_d=OP+'PD')
+ # exec py.code.Source(_source).compile()
+ #del genop_vec_float_arith
+
+ #def genop_vec_float_truediv(self, op, arglocs, resloc):
+ # loc0, loc1, sizeloc = arglocs
+ # size = sizeloc.value
+ # if size == 4:
+ # self.mc.DIVPS(loc0, loc1)
+ # elif size == 8:
+ # self.mc.DIVPD(loc0, loc1)
+
+ #def genop_vec_float_abs(self, op, arglocs, resloc):
+ # src, sizeloc = arglocs
+ # size = sizeloc.value
+ # if size == 4:
+ # self.mc.ANDPS(src, heap(self.single_float_const_abs_addr))
+ # elif size == 8:
+ # self.mc.ANDPD(src, heap(self.float_const_abs_addr))
+
+ #def genop_vec_float_neg(self, op, arglocs, resloc):
+ # src, sizeloc = arglocs
+ # size = sizeloc.value
+ # if size == 4:
+ # self.mc.XORPS(src, heap(self.single_float_const_neg_addr))
+ # elif size == 8:
+ # self.mc.XORPD(src, heap(self.float_const_neg_addr))
+
+ #def genop_vec_float_eq(self, op, arglocs, resloc):
+ # _, rhsloc, sizeloc = arglocs
+ # size = sizeloc.value
+ # if size == 4:
+ # self.mc.CMPPS_xxi(resloc.value, rhsloc.value, 0) # 0 means equal
+ # else:
+ # self.mc.CMPPD_xxi(resloc.value, rhsloc.value, 0)
+
+ #def genop_vec_float_ne(self, op, arglocs, resloc):
+ # _, rhsloc, sizeloc = arglocs
+ # size = sizeloc.value
+ # # b(100) == 1 << 2 means not equal
+ # if size == 4:
+ # self.mc.CMPPS_xxi(resloc.value, rhsloc.value, 1 << 2)
+ # else:
+ # self.mc.CMPPD_xxi(resloc.value, rhsloc.value, 1 << 2)
+
+ #def genop_vec_int_eq(self, op, arglocs, resloc):
+ # _, rhsloc, sizeloc = arglocs
+ # size = sizeloc.value
+ # self.mc.PCMPEQ(resloc, rhsloc, size)
+
+ #def genop_vec_int_ne(self, op, arglocs, resloc):
+ # _, rhsloc, sizeloc = arglocs
+ # size = sizeloc.value
+ # self.mc.PCMPEQ(resloc, rhsloc, size)
+ # temp = X86_64_XMM_SCRATCH_REG
+ # self.mc.PCMPEQQ(temp, temp) # set all bits to one
+ # # need to invert the value in resloc
+ # self.mc.PXOR(resloc, temp)
+ # # 11 00 11 11
+ # # 11 11 11 11
+ # # ----------- pxor
+ # # 00 11 00 00
+
+ #def genop_vec_int_signext(self, op, arglocs, resloc):
+ # srcloc, sizeloc, tosizeloc = arglocs
+ # size = sizeloc.value
+ # tosize = tosizeloc.value
+ # if size == tosize:
+ # return # already the right size
+ # if size == 4 and tosize == 8:
+ # scratch = X86_64_SCRATCH_REG.value
+ # self.mc.PEXTRD_rxi(scratch, srcloc.value, 1)
+ # self.mc.PINSRQ_xri(resloc.value, scratch, 1)
+ # self.mc.PEXTRD_rxi(scratch, srcloc.value, 0)
+ # self.mc.PINSRQ_xri(resloc.value, scratch, 0)
+ # elif size == 8 and tosize == 4:
+ # # is there a better sequence to move them?
+ # scratch = X86_64_SCRATCH_REG.value
+ # self.mc.PEXTRQ_rxi(scratch, srcloc.value, 0)
+ # self.mc.PINSRD_xri(resloc.value, scratch, 0)
+ # self.mc.PEXTRQ_rxi(scratch, srcloc.value, 1)
+ # self.mc.PINSRD_xri(resloc.value, scratch, 1)
+ # else:
+ # # note that all other conversions are not implemented
+ # # on purpose. it needs many x86 op codes to implement
+ # # the missing combinations. even if they are implemented
+ # # the speedup might only be modest...
+ # # the optimization does not emit such code!
+ # msg = "vec int signext (%d->%d)" % (size, tosize)
+ # not_implemented(msg)
+
+ #def genop_vec_expand_f(self, op, arglocs, resloc):
+ # srcloc, sizeloc = arglocs
+ # size = sizeloc.value
+ # if isinstance(srcloc, ConstFloatLoc):
+ # # they are aligned!
+ # self.mc.MOVAPD(resloc, srcloc)
+ # elif size == 4:
+ # # the register allocator forces src to be the same as resloc
+ # # r = (s[0], s[0], r[0], r[0])
+ # # since resloc == srcloc: r = (r[0], r[0], r[0], r[0])
+ # self.mc.SHUFPS_xxi(resloc.value, srcloc.value, 0)
+ # elif size == 8:
+ # self.mc.MOVDDUP(resloc, srcloc)
+ # else:
+ # raise AssertionError("float of size %d not supported" % (size,))
+
+ #def genop_vec_expand_i(self, op, arglocs, resloc):
+ # srcloc, sizeloc = arglocs
+ # if not isinstance(srcloc, RegLoc):
+ # self.mov(srcloc, X86_64_SCRATCH_REG)
+ # srcloc = X86_64_SCRATCH_REG
+ # assert not srcloc.is_xmm
+ # size = sizeloc.value
+ # if size == 1:
+ # self.mc.PINSRB_xri(resloc.value, srcloc.value, 0)
+ # self.mc.PSHUFB(resloc, heap(self.expand_byte_mask_addr))
+ # elif size == 2:
+ # self.mc.PINSRW_xri(resloc.value, srcloc.value, 0)
+ # self.mc.PINSRW_xri(resloc.value, srcloc.value, 4)
+ # self.mc.PSHUFLW_xxi(resloc.value, resloc.value, 0)
+ # self.mc.PSHUFHW_xxi(resloc.value, resloc.value, 0)
+ # elif size == 4:
+ # self.mc.PINSRD_xri(resloc.value, srcloc.value, 0)
+ # self.mc.PSHUFD_xxi(resloc.value, resloc.value, 0)
+ # elif size == 8:
+ # self.mc.PINSRQ_xri(resloc.value, srcloc.value, 0)
+ # self.mc.PINSRQ_xri(resloc.value, srcloc.value, 1)
+ # else:
+ # raise AssertionError("cannot handle size %d (int expand)" % (size,))
+
+ #def genop_vec_pack_i(self, op, arglocs, resloc):
+ # resultloc, sourceloc, residxloc, srcidxloc, countloc, sizeloc = arglocs
+ # assert isinstance(resultloc, RegLoc)
+ # assert isinstance(sourceloc, RegLoc)
+ # size = sizeloc.value
+ # srcidx = srcidxloc.value
+ # residx = residxloc.value
+ # count = countloc.value
+ # # for small data type conversion this can be quite costy
+ # # NOTE there might be some combinations that can be handled
+ # # more efficiently! e.g.
+ # # v2 = pack(v0,v1,4,4)
+ # si = srcidx
+ # ri = residx
+ # k = count
+ # while k > 0:
+ # if size == 8:
+ # if resultloc.is_xmm and sourceloc.is_xmm: # both xmm
+ # self.mc.PEXTRQ_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
+ # self.mc.PINSRQ_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
+ # elif resultloc.is_xmm: # xmm <- reg
+ # self.mc.PINSRQ_xri(resultloc.value, sourceloc.value, ri)
+ # else: # reg <- xmm
+ # self.mc.PEXTRQ_rxi(resultloc.value, sourceloc.value, si)
+ # elif size == 4:
+ # if resultloc.is_xmm and sourceloc.is_xmm:
+ # self.mc.PEXTRD_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
+ # self.mc.PINSRD_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
+ # elif resultloc.is_xmm:
+ # self.mc.PINSRD_xri(resultloc.value, sourceloc.value, ri)
+ # else:
+ # self.mc.PEXTRD_rxi(resultloc.value, sourceloc.value, si)
+ # elif size == 2:
+ # if resultloc.is_xmm and sourceloc.is_xmm:
+ # self.mc.PEXTRW_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
+ # self.mc.PINSRW_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
+ # elif resultloc.is_xmm:
+ # self.mc.PINSRW_xri(resultloc.value, sourceloc.value, ri)
+ # else:
+ # self.mc.PEXTRW_rxi(resultloc.value, sourceloc.value, si)
+ # elif size == 1:
+ # if resultloc.is_xmm and sourceloc.is_xmm:
+ # self.mc.PEXTRB_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
+ # self.mc.PINSRB_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
+ # elif resultloc.is_xmm:
+ # self.mc.PINSRB_xri(resultloc.value, sourceloc.value, ri)
+ # else:
+ # self.mc.PEXTRB_rxi(resultloc.value, sourceloc.value, si)
+ # si += 1
+ # ri += 1
+ # k -= 1
+
+ #genop_vec_unpack_i = genop_vec_pack_i
+
+ #def genop_vec_pack_f(self, op, arglocs, resultloc):
+ # resloc, srcloc, residxloc, srcidxloc, countloc, sizeloc = arglocs
+ # assert isinstance(resloc, RegLoc)
+ # assert isinstance(srcloc, RegLoc)
+ # count = countloc.value
+ # residx = residxloc.value
+ # srcidx = srcidxloc.value
+ # size = sizeloc.value
+ # if size == 4:
+ # si = srcidx
+ # ri = residx
+ # k = count
+ # while k > 0:
+ # if resloc.is_xmm:
+ # src = srcloc.value
+ # if not srcloc.is_xmm:
+ # # if source is a normal register (unpack)
+ # assert count == 1
+ # assert si == 0
+ # self.mov(srcloc, X86_64_XMM_SCRATCH_REG)
+ # src = X86_64_XMM_SCRATCH_REG.value
+ # select = ((si & 0x3) << 6)|((ri & 0x3) << 4)
+ # self.mc.INSERTPS_xxi(resloc.value, src, select)
+ # else:
+ # self.mc.PEXTRD_rxi(resloc.value, srcloc.value, si)
+ # si += 1
+ # ri += 1
+ # k -= 1
+ # elif size == 8:
+ # assert resloc.is_xmm
+ # if srcloc.is_xmm:
+ # if srcidx == 0:
+ # if residx == 0:
+ # # r = (s[0], r[1])
+ # self.mc.MOVSD(resloc, srcloc)
+ # else:
+ # assert residx == 1
+ # # r = (r[0], s[0])
+ # self.mc.UNPCKLPD(resloc, srcloc)
+ # else:
+ # assert srcidx == 1
+ # if residx == 0:
+ # # r = (s[1], r[1])
+ # if resloc != srcloc:
+ # self.mc.UNPCKHPD(resloc, srcloc)
+ # self.mc.SHUFPD_xxi(resloc.value, resloc.value, 1)
+ # else:
+ # assert residx == 1
+ # # r = (r[0], s[1])
+ # if resloc != srcloc:
+ # self.mc.SHUFPD_xxi(resloc.value, resloc.value, 1)
+ # self.mc.UNPCKHPD(resloc, srcloc)
+ # # if they are equal nothing is to be done
+
+ #genop_vec_unpack_f = genop_vec_pack_f
+
+ #def genop_vec_cast_float_to_singlefloat(self, op, arglocs, resloc):
+ # self.mc.CVTPD2PS(resloc, arglocs[0])
+
+ #def genop_vec_cast_float_to_int(self, op, arglocs, resloc):
+ # self.mc.CVTPD2DQ(resloc, arglocs[0])
+
+ #def genop_vec_cast_int_to_float(self, op, arglocs, resloc):
+ # self.mc.CVTDQ2PD(resloc, arglocs[0])
+
+ #def genop_vec_cast_singlefloat_to_float(self, op, arglocs, resloc):
+ # self.mc.CVTPS2PD(resloc, arglocs[0])
+
+#class VectorRegallocMixin(object):
+ #_mixin_ = True
+
+ #def _consider_vec_getarrayitem(self, op):
+ # descr = op.getdescr()
+ # assert isinstance(descr, ArrayDescr)
+ # assert not descr.is_array_of_pointers() and \
+ # not descr.is_array_of_structs()
+ # itemsize, ofs, _ = unpack_arraydescr(descr)
+ # integer = not (descr.is_array_of_floats() or descr.getconcrete_type() == FLOAT)
+ # aligned = False
+ # args = op.getarglist()
+ # base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args)
+ # ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args)
+ # result_loc = self.force_allocate_reg(op)
+ # self.perform(op, [base_loc, ofs_loc, imm(itemsize), imm(ofs),
+ # imm(integer), imm(aligned)], result_loc)
+
+ #consider_vec_getarrayitem_raw_i = _consider_vec_getarrayitem
+ #consider_vec_getarrayitem_raw_f = _consider_vec_getarrayitem
+ #consider_vec_getarrayitem_gc_i = _consider_vec_getarrayitem
+ #consider_vec_getarrayitem_gc_f = _consider_vec_getarrayitem
+ #consider_vec_raw_load_i = _consider_vec_getarrayitem
+ #consider_vec_raw_load_f = _consider_vec_getarrayitem
+
+ #def _consider_vec_setarrayitem(self, op):
+ # descr = op.getdescr()
+ # assert isinstance(descr, ArrayDescr)
+ # assert not descr.is_array_of_pointers() and \
+ # not descr.is_array_of_structs()
+ # itemsize, ofs, _ = unpack_arraydescr(descr)
+ # args = op.getarglist()
+ # base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args)
+ # value_loc = self.make_sure_var_in_reg(op.getarg(2), args)
+ # ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args)
+
+ # integer = not (descr.is_array_of_floats() or descr.getconcrete_type() == FLOAT)
+ # aligned = False
+ # self.perform_discard(op, [base_loc, ofs_loc, value_loc,
+ # imm(itemsize), imm(ofs), imm(integer), imm(aligned)])
+
+ #consider_vec_setarrayitem_raw = _consider_vec_setarrayitem
+ #consider_vec_setarrayitem_gc = _consider_vec_setarrayitem
+ #consider_vec_raw_store = _consider_vec_setarrayitem
+
+ #def consider_vec_arith(self, op):
+ # lhs = op.getarg(0)
+ # assert isinstance(op, VectorOp)
+ # size = op.bytesize
+ # args = op.getarglist()
+ # loc1 = self.make_sure_var_in_reg(op.getarg(1), args)
+ # loc0 = self.xrm.force_result_in_reg(op, op.getarg(0), args)
+ # self.perform(op, [loc0, loc1, imm(size)], loc0)
+
+ #consider_vec_int_add = consider_vec_arith
+ #consider_vec_int_sub = consider_vec_arith
+ #consider_vec_int_mul = consider_vec_arith
+ #consider_vec_float_add = consider_vec_arith
+ #consider_vec_float_sub = consider_vec_arith
+ #consider_vec_float_mul = consider_vec_arith
+ #consider_vec_float_truediv = consider_vec_arith
+ #del consider_vec_arith
+
+ #def consider_vec_arith_unary(self, op):
+ # lhs = op.getarg(0)
+ # assert isinstance(lhs, VectorOp)
+ # args = op.getarglist()
+ # res = self.xrm.force_result_in_reg(op, op.getarg(0), args)
+ # self.perform(op, [res, imm(lhs.bytesize)], res)
+
+ #consider_vec_float_neg = consider_vec_arith_unary
+ #consider_vec_float_abs = consider_vec_arith_unary
+ #del consider_vec_arith_unary
+
+ #def consider_vec_logic(self, op):
+ # lhs = op.getarg(0)
+ # assert isinstance(lhs, VectorOp)
+ # args = op.getarglist()
+ # source = self.make_sure_var_in_reg(op.getarg(1), args)
+ # result = self.xrm.force_result_in_reg(op, op.getarg(0), args)
+ # self.perform(op, [source, imm(lhs.bytesize)], result)
+
+ #def consider_vec_float_eq(self, op):
+ # assert isinstance(op, VectorOp)
+ # lhs = op.getarg(0)
+ # assert isinstance(lhs, VectorOp)
+ # args = op.getarglist()
+ # rhsloc = self.make_sure_var_in_reg(op.getarg(1), args)
+ # lhsloc = self.xrm.force_result_in_reg(op, op.getarg(0), args)
+ # self.perform(op, [lhsloc, rhsloc, imm(lhs.bytesize)], lhsloc)
+
+ #consider_vec_float_ne = consider_vec_float_eq
+ #consider_vec_int_eq = consider_vec_float_eq
+ #consider_vec_int_ne = consider_vec_float_eq
+
+ #consider_vec_int_and = consider_vec_logic
+ #consider_vec_int_or = consider_vec_logic
+ #consider_vec_int_xor = consider_vec_logic
+ #del consider_vec_logic
+
+ #def consider_vec_pack_i(self, op):
+ # # new_res = vec_pack_i(res, src, index, count)
+ # assert isinstance(op, VectorOp)
+ # arg = op.getarg(1)
+ # index = op.getarg(2)
+ # count = op.getarg(3)
+ # assert isinstance(index, ConstInt)
+ # assert isinstance(count, ConstInt)
+ # args = op.getarglist()
+ # srcloc = self.make_sure_var_in_reg(arg, args)
+ # resloc = self.xrm.force_result_in_reg(op, op.getarg(0), args)
+ # residx = index.value # where to put it in result?
+ # srcidx = 0
+ # arglocs = [resloc, srcloc, imm(residx), imm(srcidx),
+ # imm(count.value), imm(op.bytesize)]
+ # self.perform(op, arglocs, resloc)
+
+ #consider_vec_pack_f = consider_vec_pack_i
+
+ #def consider_vec_unpack_i(self, op):
+ # assert isinstance(op, VectorOp)
+ # index = op.getarg(1)
+ # count = op.getarg(2)
+ # assert isinstance(index, ConstInt)
+ # assert isinstance(count, ConstInt)
+ # args = op.getarglist()
+ # srcloc = self.make_sure_var_in_reg(op.getarg(0), args)
+ # if op.is_vector():
+ # resloc = self.xrm.force_result_in_reg(op, op.getarg(0), args)
+ # size = op.bytesize
+ # else:
+ # # unpack into iX box
+ # resloc = self.force_allocate_reg(op, args)
+ # arg = op.getarg(0)
+ # assert isinstance(arg, VectorOp)
+ # size = arg.bytesize
+ # residx = 0
+ # args = op.getarglist()
+ # arglocs = [resloc, srcloc, imm(residx), imm(index.value), imm(count.value), imm(size)]
+ # self.perform(op, arglocs, resloc)
+
+ #consider_vec_unpack_f = consider_vec_unpack_i
+
+ #def consider_vec_expand_f(self, op):
+ # assert isinstance(op, VectorOp)
+ # arg = op.getarg(0)
+ # args = op.getarglist()
+ # if arg.is_constant():
+ # resloc = self.xrm.force_allocate_reg(op)
+ # srcloc = self.xrm.expand_float(op.bytesize, arg)
+ # else:
+ # resloc = self.xrm.force_result_in_reg(op, arg, args)
+ # srcloc = resloc
+ # self.perform(op, [srcloc, imm(op.bytesize)], resloc)
+
+ #def consider_vec_expand_i(self, op):
+ # assert isinstance(op, VectorOp)
+ # arg = op.getarg(0)
+ # args = op.getarglist()
+ # if arg.is_constant():
+ # srcloc = self.rm.convert_to_imm(arg)
+ # else:
+ # srcloc = self.make_sure_var_in_reg(arg, args)
+ # resloc = self.xrm.force_allocate_reg(op, args)
+ # self.perform(op, [srcloc, imm(op.bytesize)], resloc)
+
+ #def consider_vec_int_signext(self, op):
+ # assert isinstance(op, VectorOp)
+ # args = op.getarglist()
+ # resloc = self.xrm.force_result_in_reg(op, op.getarg(0), args)
+ # arg = op.getarg(0)
+ # assert isinstance(arg, VectorOp)
+ # size = arg.bytesize
+ # assert size > 0
+ # self.perform(op, [resloc, imm(size), imm(op.bytesize)], resloc)
+
+ #def consider_vec_int_is_true(self, op):
+ # args = op.getarglist()
+ # arg = op.getarg(0)
+ # assert isinstance(arg, VectorOp)
+ # argloc = self.loc(arg)
+ # resloc = self.xrm.force_result_in_reg(op, arg, args)
+ # self.perform(op, [resloc,imm(arg.bytesize)], None)
+
+ #def _consider_vec(self, op):
+ # # pseudo instruction, needed to create a new variable
+ # self.xrm.force_allocate_reg(op)
+
+ #consider_vec_i = _consider_vec
+ #consider_vec_f = _consider_vec
+
+ #def consider_vec_cast_float_to_int(self, op):
+ # args = op.getarglist()
+ # srcloc = self.make_sure_var_in_reg(op.getarg(0), args)
+ # resloc = self.xrm.force_result_in_reg(op, op.getarg(0), args)
+ # self.perform(op, [srcloc], resloc)
+
+ #consider_vec_cast_int_to_float = consider_vec_cast_float_to_int
+ #consider_vec_cast_float_to_singlefloat = consider_vec_cast_float_to_int
+ #consider_vec_cast_singlefloat_to_float = consider_vec_cast_float_to_int
+
+ #def consider_vec_guard_true(self, op):
+ # arg = op.getarg(0)
+ # loc = self.loc(arg)
+ # self.assembler.guard_vector(op, self.loc(arg), True)
+ # self.perform_guard(op, [], None)
+
+ #def consider_vec_guard_false(self, op):
+ # arg = op.getarg(0)
+ # loc = self.loc(arg)
+ # self.assembler.guard_vector(op, self.loc(arg), False)
+ # self.perform_guard(op, [], None)
+
diff --git a/rpython/rlib/jit.py b/rpython/rlib/jit.py
--- a/rpython/rlib/jit.py
+++ b/rpython/rlib/jit.py
@@ -552,7 +552,8 @@
'enable_opts': 'INTERNAL USE ONLY (MAY NOT WORK OR LEAD TO CRASHES): '
'optimizations to enable, or all = %s' % ENABLE_ALL_OPTS,
'max_unroll_recursion': 'how many levels deep to unroll a recursive function',
- 'vec': 'turn on the vectorization optimization (vecopt). requires sse4.1',
+ 'vec': 'turn on the vectorization optimization (vecopt). ' \
+ 'Supports powerpc (VSX), x86 (SSE 4.1)',
'vec_all': 'try to vectorize trace loops that occur outside of the numpy library.',
'vec_cost': 'threshold for which traces to bail. 0 means the costs.',
'vec_length': 'the amount of instructions allowed in "all" traces.',
@@ -575,7 +576,7 @@
'disable_unrolling': 200,
'enable_opts': 'all',
'max_unroll_recursion': 7,
- 'vec': 0,
+ 'vec': 1,
'vec_all': 0,
'vec_cost': 0,
'vec_length': 60,
More information about the pypy-commit
mailing list