[pypy-commit] pypy vecopt: added float_neg and float_abs implementations to x86
plan_rich
noreply at buildbot.pypy.org
Tue May 26 10:42:49 CEST 2015
Author: Richard Plangger <rich at pasra.at>
Branch: vecopt
Changeset: r77560:8b6acbbc80c6
Date: 2015-05-26 09:35 +0200
http://bitbucket.org/pypy/pypy/changeset/8b6acbbc80c6/
Log: added float_neg and float_abs implementations to x86; added new
single-float neg/abs constants to the heap; llgraph implementations
for float neg/abs/truediv
diff --git a/rpython/jit/backend/llgraph/runner.py b/rpython/jit/backend/llgraph/runner.py
--- a/rpython/jit/backend/llgraph/runner.py
+++ b/rpython/jit/backend/llgraph/runner.py
@@ -683,8 +683,15 @@
exec py.code.Source(vector_arith_code.format('float','add','+')).compile()
exec py.code.Source(vector_arith_code.format('float','sub','-')).compile()
exec py.code.Source(vector_arith_code.format('float','mul','*')).compile()
+ exec py.code.Source(vector_arith_code.format('float','truediv','/')).compile()
exec py.code.Source(vector_arith_code.format('float','eq','==')).compile()
+ def bh_vec_float_neg(self, vx):
+ return [e * -1 for e in vx]
+
+ def bh_vec_float_abs(self, vx):
+ return [abs(e) for e in vx]
+
def bh_vec_float_eq(self, vx, vy):
assert len(vx) == len(vy)
return [_vx == _vy for _vx,_vy in zip(vx,vy)]
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -52,6 +52,8 @@
self.loop_run_counters = []
self.float_const_neg_addr = 0
self.float_const_abs_addr = 0
+ self.single_float_const_neg_addr = 0
+ self.single_float_const_abs_addr = 0
self.malloc_slowpath = 0
self.malloc_slowpath_varsize = 0
self.wb_slowpath = [0, 0, 0, 0, 0]
@@ -92,20 +94,27 @@
self.current_clt = None
def _build_float_constants(self):
+ # 0x80000000000000008000000000000000
+ neg_const = '\x00\x00\x00\x00\x00\x00\x00\x80\x00\x00\x00\x00\x00\x00\x00\x80'
+ # 0x7FFFFFFFFFFFFFFF7FFFFFFFFFFFFFFF
+ abs_const = '\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F'
+ # 0x7FFFFFFF7FFFFFFF7FFFFFFF7FFFFFFF
+ single_abs_const = '\xFF\xFF\xFF\x7F\xFF\xFF\xFF\x7F\xFF\xFF\xFF\x7F\xFF\xFF\xFF\x7F'
+ # 0x80000000800000008000000080000000
+ single_neg_const = '\x00\x00\x00\x80\x00\x00\x00\x80\x00\x00\x00\x80\x00\x00\x00\x80'
+ #
+ data = neg_const + neg_const + abs_const + abs_const + \
+ single_neg_const + single_abs_const
datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr, [])
- float_constants = datablockwrapper.malloc_aligned(32, alignment=16)
+ float_constants = datablockwrapper.malloc_aligned(len(data), alignment=16)
datablockwrapper.done()
addr = rffi.cast(rffi.CArrayPtr(lltype.Char), float_constants)
- qword_padding = '\x00\x00\x00\x00\x00\x00\x00\x00'
- # 0x8000000000000000
- neg_const = '\x00\x00\x00\x00\x00\x00\x00\x80'
- # 0x7FFFFFFFFFFFFFFF
- abs_const = '\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F'
- data = neg_const + qword_padding + abs_const + qword_padding
for i in range(len(data)):
addr[i] = data[i]
self.float_const_neg_addr = float_constants
self.float_const_abs_addr = float_constants + 16
+ self.single_float_const_neg_addr = float_constants + 32
+ self.single_float_const_abs_addr = float_constants + 48
def set_extra_stack_depth(self, mc, value):
if self._is_asmgcc():
@@ -2564,12 +2573,36 @@
elif itemsize == 8:
self.mc.{p_op_d}(loc0, loc1)
"""
- for op in ['add','mul','sub','div']:
+ for op in ['add','mul','sub']:
OP = op.upper()
_source = genop_vec_float_arith.format(type=op, p_op_s=OP+'PS',p_op_d=OP+'PD')
exec py.code.Source(_source).compile()
del genop_vec_float_arith
+ def genop_vec_float_truediv(self, op, arglocs, resloc):
+ loc0, loc1, sizeloc = arglocs
+ size = sizeloc.value
+ if size == 4:
+ self.mc.DIVPS(loc0, loc1)
+ elif size == 8:
+ self.mc.DIVPD(loc0, loc1)
+
+ def genop_vec_float_abs(self, op, arglocs, resloc):
+ src, sizeloc = arglocs
+ size = sizeloc.value
+ if size == 4:
+ self.mc.ANDPS(src, heap(self.single_float_const_abs_addr))
+ elif size == 8:
+ self.mc.ANDPD(src, heap(self.float_const_abs_addr))
+
+ def genop_vec_float_neg(self, op, arglocs, resloc):
+ src, sizeloc = arglocs
+ size = sizeloc.value
+ if size == 4:
+ self.mc.XORPS(src, heap(self.single_float_const_neg_addr))
+ elif size == 8:
+ self.mc.XORPD(src, heap(self.float_const_neg_addr))
+
def genop_vec_int_signext(self, op, arglocs, resloc):
srcloc, sizeloc, tosizeloc = arglocs
size = sizeloc.value
@@ -2590,15 +2623,18 @@
self.mc.PEXTRQ_rxi(scratch, srcloc.value, 1)
self.mc.PINSRD_xri(resloc.value, scratch, 1)
else:
- raise NotImplementedError("sign ext missing")
+ raise NotImplementedError("sign ext missing: " + str(size) + " -> " + str(tosize))
def genop_vec_float_expand(self, op, arglocs, resloc):
- loc0, sizeloc, countloc = arglocs
- count = countloc.value
- if count == 1:
- raise NotImplementedError("expand count 1")
- elif count == 2:
- self.mc.MOVDDUP(resloc, loc0)
+ srcloc, sizeloc = arglocs
+ size = sizeloc.value
+ if size == 4:
+ # the register allocator forces src to be the same as resloc
+ # r = (s[0], s[0], r[0], r[0])
+ # since resloc == srcloc: r = (r[0], r[0], r[0], r[0])
+ self.mc.SHUFPS_xxi(resloc.value, srcloc.value, 0)
+ elif size == 8:
+ self.mc.MOVDDUP(resloc, srcloc)
def genop_vec_int_pack(self, op, arglocs, resloc):
resultloc, sourceloc, residxloc, srcidxloc, countloc, sizeloc = arglocs
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -1523,8 +1523,21 @@
consider_vec_float_add = consider_vec_arith
consider_vec_float_sub = consider_vec_arith
consider_vec_float_mul = consider_vec_arith
+ consider_vec_float_truediv = consider_vec_arith
del consider_vec_arith
+ def consider_vec_arith_unary(self, op):
+ lhs = op.getarg(0)
+ assert isinstance(lhs, BoxVector)
+ size = lhs.item_size
+ args = op.getarglist()
+ res = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+ self.perform(op, [res, imm(size)], res)
+
+ consider_vec_float_neg = consider_vec_arith_unary
+ consider_vec_float_abs = consider_vec_arith_unary
+ def consider_vec_arith_unary
+
def consider_vec_logic(self, op):
lhs = op.getarg(0)
assert isinstance(lhs, BoxVector)
@@ -1583,13 +1596,12 @@
def consider_vec_float_expand(self, op):
args = op.getarglist()
- srcloc = self.make_sure_var_in_reg(op.getarg(0), args)
- resloc = self.force_allocate_reg(op.result, args)
+ resloc = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
vres = op.result
assert isinstance(vres, BoxVector)
count = vres.getcount()
size = vres.getsize()
- self.perform(op, [srcloc, imm(size), imm(count)], resloc)
+ self.perform(op, [resloc, imm(size), imm(count)], resloc)
def consider_vec_int_signext(self, op):
args = op.getarglist()
diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py
--- a/rpython/jit/backend/x86/rx86.py
+++ b/rpython/jit/backend/x86/rx86.py
@@ -943,8 +943,9 @@
define_modrm_modes('DIVSD_x*', ['\xF2', rex_nw, '\x0F\x5E', register(1, 8)], regtype='XMM')
define_modrm_modes('UCOMISD_x*', ['\x66', rex_nw, '\x0F\x2E', register(1, 8)], regtype='XMM')
define_modrm_modes('XORPD_x*', ['\x66', rex_nw, '\x0F\x57', register(1, 8)], regtype='XMM')
-define_modrm_modes('XORPS_x*', [rex_nw, '\x0F\x57', register(1, 8)], regtype='XMM')
+define_modrm_modes('XORPS_x*', [ rex_nw, '\x0F\x57', register(1, 8)], regtype='XMM')
define_modrm_modes('ANDPD_x*', ['\x66', rex_nw, '\x0F\x54', register(1, 8)], regtype='XMM')
+define_modrm_modes('ANDPS_x*', [ rex_nw, '\x0F\x54', register(1, 8)], regtype='XMM')
# floating point operations (single & double)
define_modrm_modes('ADDPD_x*', ['\x66', rex_nw, '\x0F\x58', register(1, 8)], regtype='XMM')
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
@@ -1212,8 +1212,7 @@
def test_reduction_basic(self):
trace = """
- [p0, p1, p2, p3, p4]
- label(p5, i6, p2, i7, p1, p8, i9, i10, f11, i12, i13, i14)
+ [p5, i6, p2, i7, p1, p8, i9, i10, f11, i12, i13, i14]
guard_early_exit() [p2, p1, p5, f11, i9, i6, i10, i7, p8]
f15 = raw_load(i12, i10, descr=floatarraydescr)
guard_not_invalidated() [p2, p1, f15, p5, f11, i9, i6, i10, i7, p8]
@@ -1226,6 +1225,9 @@
guard_false(i23) [p2, p1, i20, i18, f16, i22, p5, None, None, i6, None, None, p8]
jump(p5, i6, p2, i18, p1, p8, i20, i22, f16, i12, i13, i14)
"""
+ opt = self.vectorize(self.parse_loop(trace))
+ self.debug_print_operations(opt.loop)
+ return
pass # TODO
trace = """
# Loop unroll (pre vectorize) : -2 with 23 ops
More information about the pypy-commit
mailing list