[pypy-commit] pypy ppc-vsx-support: add many details to implement the reduction pattern (ppc, partly working already)
plan_rich
pypy.commits at gmail.com
Fri Jul 1 08:01:10 EDT 2016
Author: Richard Plangger <planrichi at gmail.com>
Branch: ppc-vsx-support
Changeset: r85487:1360aa62b1ed
Date: 2016-07-01 14:00 +0200
http://bitbucket.org/pypy/pypy/changeset/1360aa62b1ed/
Log: add many details to implement the reduction pattern (ppc, partly
working already)
diff --git a/rpython/jit/backend/ppc/codebuilder.py b/rpython/jit/backend/ppc/codebuilder.py
--- a/rpython/jit/backend/ppc/codebuilder.py
+++ b/rpython/jit/backend/ppc/codebuilder.py
@@ -605,12 +605,14 @@
# add
xvadddp = XX3(60, XO9=96)
xvaddsp = XX3(60, XO9=64)
+ xsadddp = XX3(60, XO9=32)
# sub
xvsubdp = XX3(60, XO9=104)
xvsubsp = XX3(60, XO9=72)
# mul
xvmuldp = XX3(60, XO9=112)
xvmulsp = XX3(60, XO9=80)
+ xsmuldp = XX3(60, XO9=46)
# div
xvdivdp = XX3(60, XO9=102)
xvdivsp = XX3(60, XO9=88)
@@ -662,6 +664,12 @@
# generic splat
xxspltd = XX3_splat(60, XO13=10, OE=0)
+ xxlxor = XX3(60, XO9=154)
+ xxlor = XX3(60, XO9=146)
+
+ # vector move register is alias to vector or
+ xvmr = xxlor
+
# INTEGER
# -------
diff --git a/rpython/jit/backend/ppc/ppc_assembler.py b/rpython/jit/backend/ppc/ppc_assembler.py
--- a/rpython/jit/backend/ppc/ppc_assembler.py
+++ b/rpython/jit/backend/ppc/ppc_assembler.py
@@ -771,7 +771,7 @@
self.update_frame_depth(frame_depth_no_fixed_size + JITFRAME_FIXED_SIZE)
#
size_excluding_failure_stuff = self.mc.get_relative_pos()
- self.write_pending_failure_recoveries()
+ self.write_pending_failure_recoveries(regalloc)
full_size = self.mc.get_relative_pos()
#
self.patch_stack_checks(frame_depth_no_fixed_size + JITFRAME_FIXED_SIZE)
@@ -852,10 +852,12 @@
self.reserve_gcref_table(allgcrefs)
startpos = self.mc.get_relative_pos()
+ self._update_at_exit(arglocs, inputargs, faildescr, regalloc)
+
self._check_frame_depth(self.mc, regalloc.get_gcmap())
frame_depth_no_fixed_size = self._assemble(regalloc, inputargs, operations)
codeendpos = self.mc.get_relative_pos()
- self.write_pending_failure_recoveries()
+ self.write_pending_failure_recoveries(regalloc)
fullsize = self.mc.get_relative_pos()
#
self.patch_stack_checks(frame_depth_no_fixed_size + JITFRAME_FIXED_SIZE)
@@ -928,7 +930,7 @@
ofs = self.cpu.get_ofs_of_frame_field('jf_gcmap')
mc.store(r.SCRATCH.value, r.SPP.value, ofs)
- def break_long_loop(self):
+ def break_long_loop(self, regalloc):
# If the loop is too long, the guards in it will jump forward
# more than 32 KB. We use an approximate hack to know if we
# should break the loop here with an unconditional "b" that
@@ -936,15 +938,19 @@
jmp_pos = self.mc.currpos()
self.mc.trap()
- self.write_pending_failure_recoveries()
+ self.write_pending_failure_recoveries(regalloc)
currpos = self.mc.currpos()
pmc = OverwritingBuilder(self.mc, jmp_pos, 1)
pmc.b(currpos - jmp_pos)
pmc.overwrite()
- def generate_quick_failure(self, guardtok):
+ def generate_quick_failure(self, guardtok, regalloc):
startpos = self.mc.currpos()
+ #
+ self._update_at_exit(guardtok.fail_locs, guardtok.failargs,
+ guardtok.faildescr, regalloc)
+ #
faildescrindex, target = self.store_info_on_descr(startpos, guardtok)
assert target != 0
self.mc.load_imm(r.r2, target)
@@ -957,13 +963,13 @@
self.mc.trap()
return startpos
- def write_pending_failure_recoveries(self):
+ def write_pending_failure_recoveries(self, regalloc):
# for each pending guard, generate the code of the recovery stub
# at the end of self.mc.
for i in range(self.pending_guard_tokens_recovered,
len(self.pending_guard_tokens)):
tok = self.pending_guard_tokens[i]
- tok.pos_recovery_stub = self.generate_quick_failure(tok)
+ tok.pos_recovery_stub = self.generate_quick_failure(tok, regalloc)
self.pending_guard_tokens_recovered = len(self.pending_guard_tokens)
def patch_pending_failure_recoveries(self, rawstart):
@@ -1358,6 +1364,60 @@
self.mc.load_imm(r.SCRATCH, fail_index)
self.mc.store(r.SCRATCH.value, r.SPP.value, FORCE_INDEX_OFS)
+ def stitch_bridge(self, faildescr, target):
+ """ Stitching means that one can enter a bridge with a complete different register
+ allocation. This needs remapping which is done here for both normal registers
+ and accumulation registers.
+ """
+ import pdb; pdb.set_trace()
+ asminfo, bridge_faildescr, version, looptoken = target
+ assert isinstance(bridge_faildescr, ResumeGuardDescr)
+ assert isinstance(faildescr, ResumeGuardDescr)
+ assert asminfo.rawstart != 0
+ self.mc = codebuf.MachineCodeBlockWrapper()
+ allblocks = self.get_asmmemmgr_blocks(looptoken)
+ self.datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr,
+ allblocks)
+ frame_info = self.datablockwrapper.malloc_aligned(
+ jitframe.JITFRAMEINFO_SIZE, alignment=WORD)
+
+ self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
+ # if accumulation is saved at the guard, we need to update it here!
+ guard_locs = self.rebuild_faillocs_from_descr(faildescr, version.inputargs)
+ bridge_locs = self.rebuild_faillocs_from_descr(bridge_faildescr, version.inputargs)
+ #import pdb; pdb.set_trace()
+ guard_accum_info = faildescr.rd_vector_info
+ # O(n**2), but usually you only have at most 1 fail argument
+ while guard_accum_info:
+ bridge_accum_info = bridge_faildescr.rd_vector_info
+ while bridge_accum_info:
+ if bridge_accum_info.failargs_pos == guard_accum_info.failargs_pos:
+ # the mapping might be wrong!
+ if bridge_accum_info.location is not guard_accum_info.location:
+ self.mov(guard_accum_info.location, bridge_accum_info.location)
+ bridge_accum_info = bridge_accum_info.next()
+ guard_accum_info = guard_accum_info.next()
+
+ # register mapping is most likely NOT valid, thus remap it in this
+ # short piece of assembler
+ assert len(guard_locs) == len(bridge_locs)
+ for i,gloc in enumerate(guard_locs):
+ bloc = bridge_locs[i]
+ bstack = bloc.location_code() == 'b'
+ gstack = gloc.location_code() == 'b'
+ if bstack and gstack:
+ pass
+ elif gloc is not bloc:
+ self.mov(gloc, bloc)
+ offset = self.mc.get_relative_pos()
+ self.mc.JMP_l(0)
+ self.mc.writeimm32(0)
+ self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
+ rawstart = self.materialize_loop(looptoken)
+ # update the jump (above) to the real trace
+ self._patch_jump_to(rawstart + offset, asminfo.rawstart)
+ # update the guard to jump right to this custom piece of assembler
+ self.patch_jump_for_descr(faildescr, rawstart)
def notimplemented_op(self, op, arglocs, regalloc):
msg = '[PPC/asm] %s not implemented\n' % op.getopname()
diff --git a/rpython/jit/backend/ppc/regalloc.py b/rpython/jit/backend/ppc/regalloc.py
--- a/rpython/jit/backend/ppc/regalloc.py
+++ b/rpython/jit/backend/ppc/regalloc.py
@@ -10,7 +10,8 @@
from rpython.jit.backend.ppc.helper.regalloc import _check_imm_arg, check_imm_box
from rpython.jit.backend.ppc.helper import regalloc as helper
from rpython.jit.metainterp.history import (Const, ConstInt, ConstFloat, ConstPtr,
- INT, REF, FLOAT, VOID, VECTOR)
+ INT, REF, FLOAT, VOID, VECTOR,
+ AbstractFailDescr)
from rpython.jit.metainterp.history import JitCellToken, TargetToken
from rpython.jit.metainterp.resoperation import rop
from rpython.jit.backend.ppc import locations
@@ -370,7 +371,7 @@
self.vrm._check_invariants()
self.ivrm._check_invariants()
if self.assembler.mc.get_relative_pos() > self.limit_loop_break:
- self.assembler.break_long_loop()
+ self.assembler.break_long_loop(self)
self.limit_loop_break = (self.assembler.mc.get_relative_pos() +
LIMIT_LOOP_BREAK)
i += 1
@@ -411,10 +412,16 @@
return gcmap
def loc(self, var):
- if var.type == FLOAT:
- return self.fprm.loc(var)
+ if var.is_vector():
+ if var.type == FLOAT:
+ return self.vrm.loc(var)
+ else:
+ return self.ivrm.loc(var)
else:
- return self.rm.loc(var)
+ if var.type == FLOAT:
+ return self.fprm.loc(var)
+ else:
+ return self.rm.loc(var)
def next_instruction(self):
self.rm.next_instruction()
@@ -607,11 +614,24 @@
args.append(self.loc(arg))
else:
args.append(None)
- self.possibly_free_vars(op.getfailargs())
#
# generate_quick_failure() produces up to 14 instructions per guard
self.limit_loop_break -= 14 * 4
- #
+ # specifically for vecopt
+ descr = op.getdescr()
+ if not descr:
+ return args
+ assert isinstance(descr, AbstractFailDescr)
+ if descr.rd_vector_info:
+ accuminfo = descr.rd_vector_info
+ while accuminfo:
+ i = accuminfo.getpos_in_failargs()+1
+ accuminfo.location = args[i]
+ loc = self.loc(accuminfo.getoriginal())
+ args[i] = loc
+ accuminfo = accuminfo.next()
+
+ self.possibly_free_vars(op.getfailargs())
return args
def load_condition_into_cc(self, box):
diff --git a/rpython/jit/backend/ppc/runner.py b/rpython/jit/backend/ppc/runner.py
--- a/rpython/jit/backend/ppc/runner.py
+++ b/rpython/jit/backend/ppc/runner.py
@@ -51,7 +51,7 @@
if detect_vsx():
self.vector_ext = AltiVectorExt()
self.vector_extension = True
- # ??? self.vector_horizontal_operations = True
+ self.vector_horizontal_operations = True
self.assembler.setup_once_vector()
@rgc.no_release_gil
diff --git a/rpython/jit/backend/ppc/vector_ext.py b/rpython/jit/backend/ppc/vector_ext.py
--- a/rpython/jit/backend/ppc/vector_ext.py
+++ b/rpython/jit/backend/ppc/vector_ext.py
@@ -10,7 +10,7 @@
from rpython.rlib.objectmodel import we_are_translated
from rpython.rtyper.lltypesystem.lloperation import llop
from rpython.rtyper.lltypesystem import lltype
-from rpython.jit.backend.ppc.locations import imm
+from rpython.jit.backend.ppc.locations import imm, RegisterLocation
from rpython.jit.backend.ppc.arch import IS_BIG_ENDIAN
from rpython.jit.backend.llsupport.vector_ext import VectorExt
from rpython.jit.backend.ppc.arch import PARAM_SAVE_AREA_OFFSET
@@ -105,7 +105,6 @@
self.mc.vperm(resloc.value, Vhi, Vlo, Vp)
else:
self.mc.vperm(resloc.value, Vlo, Vhi, Vp)
- #self.mc.trap()
def _emit_vec_setitem(self, op, arglocs, regalloc):
# prepares item scale (raw_store does not)
@@ -318,60 +317,57 @@
# index += 1
# self.mc.PBLENDW_xxi(loc.value, temp.value, select)
- #def _update_at_exit(self, fail_locs, fail_args, faildescr, regalloc):
- # """ If accumulation is done in this loop, at the guard exit
- # some vector registers must be adjusted to yield the correct value
- # """
- # if not isinstance(faildescr, ResumeGuardDescr):
- # return
- # assert regalloc is not None
- # accum_info = faildescr.rd_vector_info
- # while accum_info:
- # pos = accum_info.getpos_in_failargs()
- # scalar_loc = fail_locs[pos]
- # vector_loc = accum_info.location
- # # the upper elements will be lost if saved to the stack!
- # scalar_arg = accum_info.getoriginal()
- # assert isinstance(vector_loc, RegLoc)
- # if not isinstance(scalar_loc, RegLoc):
- # scalar_loc = regalloc.force_allocate_reg(scalar_arg)
- # assert scalar_arg is not None
- # if accum_info.accum_operation == '+':
- # self._accum_reduce_sum(scalar_arg, vector_loc, scalar_loc)
- # elif accum_info.accum_operation == '*':
- # self._accum_reduce_mul(scalar_arg, vector_loc, scalar_loc)
- # else:
- # not_implemented("accum operator %s not implemented" %
- # (accum_info.accum_operation))
- # accum_info = accum_info.next()
+ def _update_at_exit(self, fail_locs, fail_args, faildescr, regalloc):
+ """ If accumulation is done in this loop, at the guard exit
+ some vector registers must be adjusted to yield the correct value
+ """
+ if not isinstance(faildescr, ResumeGuardDescr):
+ return
+ accum_info = faildescr.rd_vector_info
+ while accum_info:
+ pos = accum_info.getpos_in_failargs()
+ scalar_loc = fail_locs[pos]
+ vector_loc = accum_info.location
+ # the upper elements will be lost if saved to the stack!
+ scalar_arg = accum_info.getoriginal()
+ if not scalar_loc.is_reg():
+ scalar_loc = regalloc.force_allocate_reg(scalar_arg)
+ assert scalar_arg is not None
+ if accum_info.accum_operation == '+':
+ self._accum_reduce_sum(scalar_arg, vector_loc, scalar_loc)
+ elif accum_info.accum_operation == '*':
+ self._accum_reduce_mul(scalar_arg, vector_loc, scalar_loc)
+ else:
+ not_implemented("accum operator %s not implemented" %
+ (accum_info.accum_operation))
+ accum_info = accum_info.next()
- #def _accum_reduce_mul(self, arg, accumloc, targetloc):
- # scratchloc = X86_64_XMM_SCRATCH_REG
- # self.mov(accumloc, scratchloc)
- # # swap the two elements
- # self.mc.SHUFPD_xxi(scratchloc.value, scratchloc.value, 0x01)
- # self.mc.MULSD(accumloc, scratchloc)
- # if accumloc is not targetloc:
- # self.mov(accumloc, targetloc)
+ def _accum_reduce_mul(self, arg, accumloc, targetloc):
+ notimplemented("[ppc reduce mul]")
+ #scratchloc = X86_64_XMM_SCRATCH_REG
+ #self.mov(accumloc, scratchloc)
+ ## swap the two elements
+ #self.mc.SHUFPD_xxi(scratchloc.value, scratchloc.value, 0x01)
+ #self.mc.MULSD(accumloc, scratchloc)
+ #if accumloc is not targetloc:
+ # self.mov(accumloc, targetloc)
- #def _accum_reduce_sum(self, arg, accumloc, targetloc):
- # # Currently the accumulator can ONLY be the biggest
- # # size for X86 -> 64 bit float/int
- # if arg.type == FLOAT:
- # # r = (r[0]+r[1],r[0]+r[1])
- # self.mc.HADDPD(accumloc, accumloc)
- # # upper bits (> 64) are dirty (but does not matter)
- # if accumloc is not targetloc:
- # self.mov(accumloc, targetloc)
- # return
- # elif arg.type == INT:
- # scratchloc = X86_64_SCRATCH_REG
- # self.mc.PEXTRQ_rxi(targetloc.value, accumloc.value, 0)
- # self.mc.PEXTRQ_rxi(scratchloc.value, accumloc.value, 1)
- # self.mc.ADD(targetloc, scratchloc)
- # return
+ def _accum_reduce_sum(self, arg, accumloc, targetloc):
+ # Currently the accumulator can ONLY be the biggest
+ # 64 bit float/int
+ tgt = targetloc.value
+ acc = accumloc.value
+ if arg.type == FLOAT:
+ # r = (r[0]+r[1],r[0]+r[1])
+ self.mc.xvmr(tgt, acc, acc)
+ if IS_BIG_ENDIAN:
+ self.mc.xxspltd(tgt, acc, acc, 0b00)
+ else:
+ self.mc.xxspltd(tgt, acc, acc, 0b01)
+ self.mc.xsadddp(tgt, tgt, acc)
+ return
- # not_implemented("reduce sum for %s not impl." % arg)
+ not_implemented("reduce sum for %s not impl." % arg)
def emit_vec_int_is_true(self, op, arglocs, regalloc):
resloc, argloc, sizeloc = arglocs
@@ -408,6 +404,13 @@
self.mc.lvx(resloc.value, off, r.SP.value)
flush_vec_cc(self, regalloc, c.EQ, op.bytesize, resloc)
+ def emit_vec_float_xor(self, op, arglocs, regalloc):
+ resloc, l0, l1, sizeloc = arglocs
+ res = resloc.value
+ r0 = l0.value
+ r1 = l1.value
+ self.mc.xxlxor(res, r0, r1)
+
def emit_vec_float_ne(self, op, arglocs, regalloc):
resloc, loc1, loc2, sizeloc = arglocs
size = sizeloc.value
@@ -565,61 +568,49 @@
#genop_vec_unpack_i = genop_vec_pack_i
- #def genop_vec_pack_f(self, op, arglocs, resultloc):
- # resloc, srcloc, residxloc, srcidxloc, countloc, sizeloc = arglocs
- # assert isinstance(resloc, RegLoc)
- # assert isinstance(srcloc, RegLoc)
- # count = countloc.value
- # residx = residxloc.value
- # srcidx = srcidxloc.value
- # size = sizeloc.value
- # if size == 4:
- # si = srcidx
- # ri = residx
- # k = count
- # while k > 0:
- # if resloc.is_xmm:
- # src = srcloc.value
- # if not srcloc.is_xmm:
- # # if source is a normal register (unpack)
- # assert count == 1
- # assert si == 0
- # self.mov(srcloc, X86_64_XMM_SCRATCH_REG)
- # src = X86_64_XMM_SCRATCH_REG.value
- # select = ((si & 0x3) << 6)|((ri & 0x3) << 4)
- # self.mc.INSERTPS_xxi(resloc.value, src, select)
- # else:
- # self.mc.PEXTRD_rxi(resloc.value, srcloc.value, si)
- # si += 1
- # ri += 1
- # k -= 1
- # elif size == 8:
- # assert resloc.is_xmm
- # if srcloc.is_xmm:
- # if srcidx == 0:
- # if residx == 0:
- # # r = (s[0], r[1])
- # self.mc.MOVSD(resloc, srcloc)
- # else:
- # assert residx == 1
- # # r = (r[0], s[0])
- # self.mc.UNPCKLPD(resloc, srcloc)
- # else:
- # assert srcidx == 1
- # if residx == 0:
- # # r = (s[1], r[1])
- # if resloc != srcloc:
- # self.mc.UNPCKHPD(resloc, srcloc)
- # self.mc.SHUFPD_xxi(resloc.value, resloc.value, 1)
- # else:
- # assert residx == 1
- # # r = (r[0], s[1])
- # if resloc != srcloc:
- # self.mc.SHUFPD_xxi(resloc.value, resloc.value, 1)
- # self.mc.UNPCKHPD(resloc, srcloc)
- # # if they are equal nothing is to be done
+ def emit_vec_pack_f(self, op, arglocs, resultloc):
+ resloc, vloc, srcloc, residxloc, srcidxloc, countloc = arglocs
+ vec = vloc.value
+ res = resloc.value
+ src = srcloc.value
+ count = countloc.value
+ residx = residxloc.value
+ srcidx = srcidxloc.value
+ size = op.bytesize
+ assert size == 8
+ # srcloc is always a floating point register f, this means it is
+ # vsr[0] == valueof(f)
+ if srcidx == 0:
+ if residx == 0:
+ # r = (s[0], r[1])
+ if IS_BIG_ENDIAN:
+ self.mc.xxspltd(res, src, vec, 0b10)
+ else:
+ self.mc.xxspltd(res, src, vec, 0b01)
+ else:
+ assert residx == 1
+ # r = (r[0], s[0])
+ if IS_BIG_ENDIAN:
+ self.mc.xxspltd(res, vec, src, 0b00)
+ else:
+ self.mc.xxspltd(res, vec, src, 0b11)
+ else:
+ assert srcidx == 1
+ if residx == 0:
+ # r = (s[1], r[1])
+ if IS_BIG_ENDIAN:
+ self.mc.xxspltd(res, src, vec, 0b11)
+ else:
+ self.mc.xxspltd(res, src, vec, 0b00)
+ else:
+ assert residx == 1
+ # r = (r[0], s[1])
+ if IS_BIG_ENDIAN:
+ self.mc.xxspltd(res, vec, src, 0b10)
+ else:
+ self.mc.xxspltd(res, vec, src, 0b01)
- #genop_vec_unpack_f = genop_vec_pack_f
+ emit_vec_unpack_f = emit_vec_pack_f
# needed as soon as PPC's support_singlefloat is implemented!
#def genop_vec_cast_float_to_int(self, op, arglocs, regalloc):
@@ -627,6 +618,10 @@
#def genop_vec_cast_singlefloat_to_float(self, op, arglocs, regalloc):
# self.mc.CVTPS2PD(resloc, arglocs[0])
+ def emit_vec_f(self, op, arglocs, regalloc):
+ pass
+ emit_vec_i = emit_vec_f
+
class VectorRegalloc(object):
_mixin_ = True
@@ -709,9 +704,10 @@
prepare_vec_int_xor = prepare_vec_arith
prepare_vec_float_eq = prepare_vec_arith
- prepare_vec_float_ne = prepare_vec_float_eq
- prepare_vec_int_eq = prepare_vec_float_eq
- prepare_vec_int_ne = prepare_vec_float_eq
+ prepare_vec_float_ne = prepare_vec_arith
+ prepare_vec_int_eq = prepare_vec_arith
+ prepare_vec_int_ne = prepare_vec_arith
+ prepare_vec_float_xor = prepare_vec_arith
del prepare_vec_arith
@@ -751,24 +747,35 @@
prepare_vec_float_abs = prepare_vec_arith_unary
del prepare_vec_arith_unary
- #def prepare_vec_pack_i(self, op):
- # # new_res = vec_pack_i(res, src, index, count)
- # assert isinstance(op, VectorOp)
- # arg = op.getarg(1)
- # index = op.getarg(2)
- # count = op.getarg(3)
- # assert isinstance(index, ConstInt)
- # assert isinstance(count, ConstInt)
- # args = op.getarglist()
- # srcloc = self.make_sure_var_in_reg(arg, args)
- # resloc = self.xrm.force_result_in_reg(op, op.getarg(0), args)
- # residx = index.value # where to put it in result?
- # srcidx = 0
- # arglocs = [resloc, srcloc, imm(residx), imm(srcidx),
- # imm(count.value), imm(op.bytesize)]
- # self.perform(op, arglocs, resloc)
+ def prepare_vec_pack_i(self, op):
+ # new_res = vec_pack_i(res, src, index, count)
+ assert isinstance(op, VectorOp)
+ arg = op.getarg(1)
+ index = op.getarg(2)
+ count = op.getarg(3)
+ assert isinstance(index, ConstInt)
+ assert isinstance(count, ConstInt)
+ srcloc = self.ensure_vector_reg(arg)
+ resloc = self.force_allocate_vector_reg(op)
+ residx = index.value # where to put it in result?
+ srcidx = 0
+ return [resloc, srcloc, imm(residx), imm(srcidx), imm(count.value)]
- #prepare_vec_pack_f = prepare_vec_pack_i
+ def prepare_vec_pack_f(self, op):
+ # new_res = vec_pack_i(res, src, index, count)
+ assert isinstance(op, VectorOp)
+ arg = op.getarg(1)
+ index = op.getarg(2)
+ count = op.getarg(3)
+ assert isinstance(index, ConstInt)
+ assert isinstance(count, ConstInt)
+ assert not arg.is_vector()
+ srcloc = self.ensure_reg(arg)
+ vloc = self.ensure_vector_reg(op.getarg(0))
+ resloc = self.force_allocate_vector_reg(op)
+ residx = index.value # where to put it in result?
+ srcidx = 0
+ return [resloc, vloc, srcloc, imm(residx), imm(srcidx), imm(count.value)]
#def prepare_vec_unpack_i(self, op):
# assert isinstance(op, VectorOp)
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -645,7 +645,7 @@
bridge_locs = self.rebuild_faillocs_from_descr(bridge_faildescr, version.inputargs)
#import pdb; pdb.set_trace()
guard_accum_info = faildescr.rd_vector_info
- # O(n^2), but usually you only have at most 1 fail argument
+ # O(n**2), but usually you only have at most 1 fail argument
while guard_accum_info:
bridge_accum_info = bridge_faildescr.rd_vector_info
while bridge_accum_info:
diff --git a/rpython/jit/metainterp/optimizeopt/vector.py b/rpython/jit/metainterp/optimizeopt/vector.py
--- a/rpython/jit/metainterp/optimizeopt/vector.py
+++ b/rpython/jit/metainterp/optimizeopt/vector.py
@@ -793,7 +793,10 @@
if pack.reduce_init() == 0:
vecop = OpHelpers.create_vec(datatype, bytesize, signed, count)
oplist.append(vecop)
- vecop = VecOperation(rop.VEC_INT_XOR, [vecop, vecop],
+ opnum = rop.VEC_INT_XOR
+ if datatype == FLOAT:
+ opnum = rop.VEC_FLOAT_XOR
+ vecop = VecOperation(opnum, [vecop, vecop],
vecop, count)
oplist.append(vecop)
elif pack.reduce_init() == 1:
diff --git a/rpython/jit/metainterp/resoperation.py b/rpython/jit/metainterp/resoperation.py
--- a/rpython/jit/metainterp/resoperation.py
+++ b/rpython/jit/metainterp/resoperation.py
@@ -994,6 +994,7 @@
'_VEC_ARITHMETIC_LAST',
'VEC_FLOAT_EQ/2b/i',
'VEC_FLOAT_NE/2b/i',
+ 'VEC_FLOAT_XOR/2/f',
'VEC_INT_IS_TRUE/1b/i',
'VEC_INT_NE/2b/i',
'VEC_INT_EQ/2b/i',
diff --git a/rpython/jit/metainterp/test/test_vector.py b/rpython/jit/metainterp/test/test_vector.py
--- a/rpython/jit/metainterp/test/test_vector.py
+++ b/rpython/jit/metainterp/test/test_vector.py
@@ -78,6 +78,7 @@
enable_opts = 'intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unroll'
def setup_method(self, method):
+ import pdb; pdb.set_trace()
if not self.supports_vector_ext():
py.test.skip("this cpu %s has no implemented vector backend" % CPU)
More information about the pypy-commit
mailing list