[pypy-commit] pypy ppc-vsx-support: impl. flush_vector_cc for x86 using PBLENDVB
plan_rich
pypy.commits at gmail.com
Mon Aug 1 13:05:03 EDT 2016
Author: Richard Plangger <planrichi at gmail.com>
Branch: ppc-vsx-support
Changeset: r85962:7673f44c3693
Date: 2016-08-01 19:04 +0200
http://bitbucket.org/pypy/pypy/changeset/7673f44c3693/
Log: impl. flush_vector_cc for x86 using PBLENDVB
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -108,9 +108,20 @@
single_neg_const = '\x00\x00\x00\x80\x00\x00\x00\x80\x00\x00\x00\x80\x00\x00\x00\x80'
zero_const = '\x00' * 16
#
+ two_64bit_ones = '\x01\x00\x00\x00\x00\x00\x00\x00' * 2
+ four_32bit_ones = '\x01\x00\x00\x00' * 4
+ eight_16bit_ones = '\x01\x00' * 8
+ sixteen_8bit_ones = '\x01' * 16
+
+
+
+
+
+ #
data = neg_const + abs_const + \
single_neg_const + single_abs_const + \
- zero_const
+ zero_const + sixteen_8bit_ones + eight_16bit_ones + \
+ four_32bit_ones + two_64bit_ones
datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr, [])
float_constants = datablockwrapper.malloc_aligned(len(data), alignment=16)
datablockwrapper.done()
@@ -122,6 +133,7 @@
self.single_float_const_neg_addr = float_constants + 32
self.single_float_const_abs_addr = float_constants + 48
self.expand_byte_mask_addr = float_constants + 64
+ self.element_ones = [float_constants + 80 + 16*i for i in range(4)]
def set_extra_stack_depth(self, mc, value):
if self._is_asmgcc():
diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py
--- a/rpython/jit/backend/x86/rx86.py
+++ b/rpython/jit/backend/x86/rx86.py
@@ -793,6 +793,7 @@
PTEST_xx = xmminsn('\x66', rex_nw, '\x0F\x38\x17', register(1,8), register(2), '\xC0')
PBLENDW_xxi = xmminsn('\x66', rex_nw, '\x0F\x3A\x0E', register(1,8), register(2), '\xC0', immediate(3, 'b'))
+ PBLENDVB_xx = xmminsn('\x66', rex_nw, '\x0F\x38\x10', register(1,8), register(2), '\xC0')
CMPPD_xxi = xmminsn('\x66', rex_nw, '\x0F\xC2', register(1,8), register(2), '\xC0', immediate(3, 'b'))
CMPPS_xxi = xmminsn( rex_nw, '\x0F\xC2', register(1,8), register(2), '\xC0', immediate(3, 'b'))
diff --git a/rpython/jit/backend/x86/vector_ext.py b/rpython/jit/backend/x86/vector_ext.py
--- a/rpython/jit/backend/x86/vector_ext.py
+++ b/rpython/jit/backend/x86/vector_ext.py
@@ -10,7 +10,7 @@
xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14,
X86_64_SCRATCH_REG, X86_64_XMM_SCRATCH_REG, AddressLoc)
from rpython.jit.backend.llsupport.vector_ext import VectorExt
-from rpython.jit.backend.llsupport.regalloc import get_scale
+from rpython.jit.backend.llsupport.regalloc import get_scale, TempVar
from rpython.jit.metainterp.resoperation import (rop, ResOperation,
VectorOp, VectorGuardOp)
from rpython.rlib.objectmodel import we_are_translated, always_inline
@@ -33,6 +33,14 @@
raise NotImplementedError(msg)
# DUP END
+class TempVector(TempVar):
+ def __init__(self, type):
+ self.type = type
+ def is_vector(self):
+ return True
+ def __repr__(self):
+ return "<TempVector At %s>" % (id(self),)
+
class X86VectorExt(VectorExt):
def setup_once(self, asm):
if detect_feature.detect_sse4_1():
@@ -292,29 +300,50 @@
self.mc.XORPD(src, heap(self.float_const_neg_addr))
def genop_vec_float_eq(self, op, arglocs, resloc):
- _, rhsloc, sizeloc = arglocs
+ lhsloc, rhsloc, sizeloc = arglocs
size = sizeloc.value
if size == 4:
- self.mc.CMPPS_xxi(resloc.value, rhsloc.value, 0) # 0 means equal
+ self.mc.CMPPS_xxi(lhsloc.value, rhsloc.value, 0) # 0 means equal
else:
- self.mc.CMPPD_xxi(resloc.value, rhsloc.value, 0)
+ self.mc.CMPPD_xxi(lhsloc.value, rhsloc.value, 0)
+ self.flush_vec_cc(rx86.Conditions["E"], lhsloc, resloc, sizeloc.value)
+
+ def flush_vec_cc(self, rev_cond, lhsloc, resloc, size):
+ # After emitting an instruction that leaves a boolean result in
+ # a condition code (cc), call this. In the common case, result_loc
+ # will be set to SPP by the regalloc, which in this case means
+ # "propagate it between this operation and the next guard by keeping
+ # it in the cc". In the uncommon case, result_loc is another
+ # register, and we emit a load from the cc into this register.
+
+ if resloc is ebp:
+ self.guard_success_cc = rev_cond
+ else:
+ assert lhsloc is xmm0
+ maskloc = X86_64_XMM_SCRATCH_REG
+ self.mc.MOVAPD(maskloc, heap(self.element_ones[get_scale(size)]))
+ self.mc.PXOR(resloc, resloc)
+ # note that xmm0 contains true false for each element by the last compare operation
+ self.mc.PBLENDVB_xx(resloc.value, maskloc.value)
def genop_vec_float_ne(self, op, arglocs, resloc):
- _, rhsloc, sizeloc = arglocs
+ lhsloc, rhsloc, sizeloc = arglocs
size = sizeloc.value
# b(100) == 1 << 2 means not equal
if size == 4:
- self.mc.CMPPS_xxi(resloc.value, rhsloc.value, 1 << 2)
+ self.mc.CMPPS_xxi(lhsloc.value, rhsloc.value, 1 << 2)
else:
- self.mc.CMPPD_xxi(resloc.value, rhsloc.value, 1 << 2)
+ self.mc.CMPPD_xxi(lhsloc.value, rhsloc.value, 1 << 2)
+ self.flush_vec_cc(rx86.Conditions["NE"], lhsloc, resloc, sizeloc.value)
def genop_vec_int_eq(self, op, arglocs, resloc):
- _, rhsloc, sizeloc = arglocs
+ lhsloc, rhsloc, sizeloc = arglocs
size = sizeloc.value
- self.mc.PCMPEQ(resloc, rhsloc, size)
+ self.mc.PCMPEQ(lhsloc, rhsloc, size)
+ self.flush_vec_cc(rx86.Conditions["E"], lhsloc, resloc, sizeloc.value)
def genop_vec_int_ne(self, op, arglocs, resloc):
- _, rhsloc, sizeloc = arglocs
+ lhsloc, rhsloc, sizeloc = arglocs
size = sizeloc.value
self.mc.PCMPEQ(resloc, rhsloc, size)
temp = X86_64_XMM_SCRATCH_REG
@@ -325,6 +354,7 @@
# 11 11 11 11
# ----------- pxor
# 00 11 00 00
+ self.flush_vec_cc(rx86.Conditions["NE"], lhsloc, resloc, sizeloc.value)
def genop_vec_int_signext(self, op, arglocs, resloc):
srcloc, sizeloc, tosizeloc = arglocs
@@ -599,9 +629,55 @@
lhs = op.getarg(0)
assert isinstance(lhs, VectorOp)
args = op.getarglist()
+ # we need to use xmm0
+ lhsloc = self.enforce_var_in_vector_reg(op.getarg(0), args, selected_reg=xmm0)
rhsloc = self.make_sure_var_in_reg(op.getarg(1), args)
- lhsloc = self.xrm.force_result_in_reg(op, op.getarg(0), args)
- self.perform(op, [lhsloc, rhsloc, imm(lhs.bytesize)], lhsloc)
+ resloc = self.force_allocate_vector_reg_or_cc(op)
+ self.perform(op, [lhsloc, rhsloc, imm(lhs.bytesize)], resloc)
+
+ def enforce_var_in_vector_reg(self, arg, forbidden_vars, selected_reg):
+ """ Enforce the allocation in a specific register. This can even be a forbidden
+ register. If it is forbidden, it will be moved to another register.
+ Use with caution, currently this is only used for the vectorization backend
+ instructions.
+ """
+ xrm = self.xrm
+ if selected_reg not in xrm.free_regs:
+ variable = None
+ candidate_to_spill = None
+ for var, reg in self.xrm.reg_bindings.items():
+ if reg is selected_reg:
+ variable = var
+ else:
+ if var not in forbidden_vars:
+ candidate_to_spill = var
+ # do we have a free register?
+ if len(xrm.free_regs) == 0:
+ # spill a non forbidden variable
+ self._spill_var(candidate_to_spill, forbidden_vars, None)
+ loc = xrm.free_regs.pop()
+ self.assembler.mov(selected_reg, loc)
+ reg = xrm.reg_bindings.get(arg, None)
+ if reg:
+ xrm.free_regs.append(reg)
+ self.assembler.mov(reg, selected_reg)
+ xrm.reg_bindings[arg] = selected_reg
+ xrm.reg_bindings[variable] = loc
+
+ return selected_reg
+ return self.make_sure_var_in_reg(arg, forbidden_vars, selected_reg=selected_reg)
+
+ def force_allocate_vector_reg_or_cc(self, var):
+ assert var.type == INT
+ if self.next_op_can_accept_cc(self.operations, self.rm.position):
+ # hack: return the ebp location to mean "lives in CC". This
+ # ebp will not actually be used, and the location will be freed
+ # after the next op as usual.
+ self.xrm.force_allocate_frame_reg(var)
+ return ebp
+ else:
+ # else, return a regular register (not ebp).
+ return self.xrm.force_allocate_reg(var)
consider_vec_float_ne = consider_vec_float_eq
consider_vec_int_eq = consider_vec_float_eq
More information about the pypy-commit
mailing list