[pypy-commit] pypy vecopt: added many opcodes for x86 that operate on packed data (single float)
plan_rich
noreply at buildbot.pypy.org
Wed May 13 16:08:19 CEST 2015
Author: Richard Plangger <rich at pasra.at>
Branch: vecopt
Changeset: r77308:9a5fe21d3676
Date: 2015-05-13 15:53 +0200
http://bitbucket.org/pypy/pypy/changeset/9a5fe21d3676/
Log: Added many opcodes for x86 that operate on packed data (single
float). Added a test to stress cast up/cast down. Extended the array
descriptor with a field that tracks whether it loads from a float32
(necessary for vectorization); there might be another solution for
that, but this needs to be discussed.
diff --git a/pypy/module/micronumpy/test/test_zjit.py b/pypy/module/micronumpy/test/test_zjit.py
--- a/pypy/module/micronumpy/test/test_zjit.py
+++ b/pypy/module/micronumpy/test/test_zjit.py
@@ -97,6 +97,8 @@
def test_add_float(self):
result = self.run("add_float")
assert result == 3 + 3
+
+ def test_add_float32(self):
result = self.run("add_float32")
assert result == 3.0 + 3.0
@@ -116,6 +118,10 @@
result = self.run("add_float_const")
assert result == 29.0 + 3.0
self.check_trace_count(1)
+ def test_add_float22_const(self):
+ result = self.run("add_float_const")
+ assert result == 29.0 + 3.0
+ self.check_trace_count(1)
result = self.run("add_float32_const")
assert result == 29.0 + 3.0
self.check_trace_count(1)
diff --git a/rpython/jit/backend/llgraph/runner.py b/rpython/jit/backend/llgraph/runner.py
--- a/rpython/jit/backend/llgraph/runner.py
+++ b/rpython/jit/backend/llgraph/runner.py
@@ -689,8 +689,6 @@
assert len(vx) == count
assert len(vy) == count
return [_vx == _vy for _vx,_vy in zip(vx,vy)]
- bh_vec_float_eq.argtypes = ['f','f','i']
- bh_vec_float_eq.resulttype = 'i'
def bh_vec_cast_float_to_singlefloat(self, vx):
return vx
diff --git a/rpython/jit/backend/llsupport/descr.py b/rpython/jit/backend/llsupport/descr.py
--- a/rpython/jit/backend/llsupport/descr.py
+++ b/rpython/jit/backend/llsupport/descr.py
@@ -192,6 +192,7 @@
lendescr = None
flag = '\x00'
vinfo = None
+ loaded_float = False
def __init__(self, basesize, itemsize, lendescr, flag):
self.basesize = basesize
@@ -260,6 +261,10 @@
lendescr = get_field_arraylen_descr(gccache, ARRAY_OR_STRUCT)
flag = get_type_flag(ARRAY_INSIDE.OF)
arraydescr = ArrayDescr(basesize, itemsize, lendescr, flag)
+ if ARRAY_INSIDE.OF is lltype.SingleFloat:
+ # it would be optimal to set the flag as FLOAT_TYPE
+ # but it is not possible???
+ arraydescr.loaded_float = True
if ARRAY_OR_STRUCT._gckind == 'gc':
gccache.init_array_descr(ARRAY_OR_STRUCT, arraydescr)
cache[ARRAY_OR_STRUCT] = arraydescr
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -7,7 +7,7 @@
DEBUG_COUNTER, debug_bridge)
from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
from rpython.jit.backend.llsupport.gcmap import allocate_gcmap
-from rpython.jit.metainterp.history import Const, Box, VOID
+from rpython.jit.metainterp.history import Const, Box, VOID, BoxVector
from rpython.jit.metainterp.history import AbstractFailDescr, INT, REF, FLOAT
from rpython.rtyper.lltypesystem import lltype, rffi, rstr, llmemory
from rpython.rtyper.lltypesystem.lloperation import llop
@@ -2473,10 +2473,10 @@
else:
self.mc.MOVDQU(resloc, src_addr)
else:
- if itemsize == 8: # TODO is there a constant for double floating point size?
+ if itemsize == 4:
+ self.mc.MOVSS(resloc, src_addr)
+ elif itemsize == 8:
self.mc.MOVSD(resloc, src_addr)
- else:
- raise NotImplementedError
def genop_discard_vec_setarrayitem_raw(self, op, arglocs):
# considers item scale (raw_store does not)
@@ -2499,10 +2499,10 @@
else:
self.mc.MOVDQU(dest_loc, value_loc)
else:
- if itemsize == 8: # TODO is there a constant for double floating point size?
+ if itemsize == 4:
+ self.mc.MOVSS(dest_loc, value_loc)
+ elif itemsize == 8:
self.mc.MOVSD(dest_loc, value_loc)
- else:
- raise NotImplementedError
def genop_vec_int_add(self, op, arglocs, resloc):
loc0, loc1, itemsize_loc = arglocs
@@ -2515,8 +2515,6 @@
self.mc.PADDD(loc0, loc1)
elif itemsize == 8:
self.mc.PADDQ(loc0, loc1)
- else:
- raise NotImplementedError
def genop_vec_int_sub(self, op, arglocs, resloc):
loc0, loc1, itemsize_loc = arglocs
@@ -2529,8 +2527,6 @@
self.mc.PSUBD(loc0, loc1)
elif itemsize == 8:
self.mc.PSUBQ(loc0, loc1)
- else:
- raise NotImplementedError
genop_vec_float_arith = """
def genop_vec_float_{type}(self, op, arglocs, resloc):
@@ -2540,8 +2536,6 @@
self.mc.{p_op_s}(loc0, loc1)
elif itemsize == 8:
self.mc.{p_op_d}(loc0, loc1)
- else:
- raise NotImplementedError
"""
for op in ['add','mul','sub','div']:
OP = op.upper()
@@ -2549,34 +2543,88 @@
exec py.code.Source(_source).compile()
del genop_vec_float_arith
+ def genop_vec_int_signext(self, op, arglocs, resloc):
+ pass
+
def genop_vec_expand(self, op, arglocs, resloc):
- loc0, sizeloc = arglocs
- size = sizeloc.value
- if size == 2:
- pass
-
+ loc0, countloc = arglocs
+ count = countloc.value
+ if count == 1:
+ raise NotImplementedError("expand count 1")
+ elif count == 2:
+ self.mc.MOVDDUP(resloc, loc0)
def genop_vec_box_unpack(self, op, arglocs, resloc):
- loc0, indexloc, sizeloc = arglocs
+ loc0, tmploc, indexloc, countloc = arglocs
+ count = countloc.value
+ index = indexloc.value
+ box = op.getarg(0)
+ assert isinstance(box, BoxVector)
+ item_type = box.item_type
+ size = box.item_size
+ if size == 4:
+ tmploc = self._shuffle_by_index(loc0, tmploc, item_type, size, index, count)
+ self.mc.MOVD32_rx(resloc.value, tmploc.value)
+ elif size == 8:
+ if index == 0:
+ self.mc.UNPCKLPD(resloc, loc0)
+ else:
+ self.mc.UNPCKHPD(resloc, loc0)
+
+ def _shuffle_by_index(self, src_loc, tmp_loc, item_type, size, index, count):
+ if index == 0 and count == 1:
+ return src_loc
+ select = 0
+ if item_type == FLOAT:
+ self.mc.MOVSS(tmp_loc, src_loc)
+ i = 0
+ while i < count:
+ select |= (index+i<<(i*2))
+ i += 1
+ self.mc.SHUFPS_xxi(tmp_loc.value, tmp_loc.value, select)
+ return tmp_loc
+ else:
+ py.test.set_trace()
+ raise NotImplementedError("shuffle by index for non floats")
+
+
+ def genop_vec_box_pack(self, op, arglocs, resloc):
+ toloc, fromloc, indexloc, sizeloc = arglocs
+ toarg = op.getarg(0)
+ index = indexloc.value
size = sizeloc.value
if size == 4:
- pass
+ select = 0
+ if index == 2:
+ select |= (1<<0)
+ select |= (2<<2)
+ select |= (3<<4)
+ select |= (4<<6)
+ else:
+ raise NotImplementedError("index is not equal to 2")
+
+ self.mc.SHUFPS_xxi(toloc.value, fromloc.value, select)
elif size == 8:
if indexloc.value == 0:
self.mc.UNPCKLPD(resloc, loc0)
else:
self.mc.UNPCKHPD(resloc, loc0)
- def genop_vec_expand(self, op, arglocs, resloc):
- loc0, countloc = arglocs
- count = countloc.value
- if count == 1:
- pass
- elif count == 2:
- self.mc.MOVDDUP(resloc, loc0)
+ def genop_vec_cast_float_to_singlefloat(self, op, arglocs, resloc):
+ argloc, _ = arglocs
+ self.mc.CVTPD2PS(resloc, argloc)
- def genop_vec_int_signext(self, op, arglocs, resloc):
- pass
+ def genop_vec_cast_singlefloat_to_float(self, op, arglocs, resloc):
+ loc0, tmploc, indexloc = arglocs
+ index = indexloc.value
+ if index == 0:
+ self.mc.CVTPS2PD(resloc, loc0)
+ else:
+ assert index == 2
+ self.mc.MOVSS_xx(tmploc.value, loc0.value)
+ select = (2<<0)|(3<<2) # move pos 2->0,3->1
+ self.mc.SHUFPS_xxi(tmploc.value, tmploc.value, select)
+ self.mc.CVTPS2PD(resloc, tmploc) # expand
# ________________________________________
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -1542,7 +1542,7 @@
assert isinstance(index, ConstInt)
itemsize = self.assembler.cpu.vector_register_size // count.value
args = op.getarglist()
- loc0 = self.xrm.make_sure_var_in_reg(op.getarg(0), args)
+ loc0 = self.make_sure_var_in_reg(op.getarg(0), args)
loc1 = self.make_sure_var_in_reg(op.getarg(1), args)
self.perform(op, [loc0, loc1, imm(index.value), imm(itemsize)], None)
@@ -1551,11 +1551,13 @@
index = op.getarg(1)
assert isinstance(count, ConstInt)
assert isinstance(index, ConstInt)
- itemsize = self.assembler.cpu.vector_register_size // count.value
args = op.getarglist()
loc0 = self.xrm.make_sure_var_in_reg(op.getarg(0), args)
result = self.force_allocate_reg(op.result, args)
- self.perform(op, [loc0, imm(index.value), imm(itemsize)], result)
+ tmpxvar = TempBox()
+ tmploc = self.xrm.force_allocate_reg(tmpxvar)
+ self.xrm.possibly_free_var(tmpxvar)
+ self.perform(op, [loc0, tmploc, imm(index.value), imm(count.value)], result)
def consider_vec_expand(self, op):
count = op.getarg(1)
@@ -1564,13 +1566,6 @@
result = self.force_allocate_reg(op.result, args)
self.perform(op, [loc0, imm(count.value)], result)
- def consider_vec_cast_float_to_singlefloat(self, op):
- size = op.getarg(1)
- args = op.getarglist()
- loc0 = self.make_sure_var_in_reg(op.getarg(0), args)
- result = self.force_allocate_reg(op.result, args)
- self.perform(op, [loc0, imm(size.value)], result)
-
def consider_vec_int_signext(self, op):
# there is not much we can do in this case. arithmetic is
# done on the vector register, if there is a wrap around,
@@ -1589,6 +1584,23 @@
def consider_guard_early_exit(self, op):
pass
+ def consider_vec_cast_float_to_singlefloat(self, op):
+ count = op.getarg(1)
+ args = op.getarglist()
+ loc0 = self.make_sure_var_in_reg(op.getarg(0), args)
+ result = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+ self.perform(op, [loc0, imm(count.value)], result)
+
+ def consider_vec_cast_singlefloat_to_float(self, op):
+ index = op.getarg(1)
+ args = op.getarglist()
+ loc0 = self.make_sure_var_in_reg(op.getarg(0), args)
+ result = self.force_allocate_reg(op.result, args)
+ tmpxvar = TempBox()
+ tmploc = self.xrm.force_allocate_reg(tmpxvar)
+ self.xrm.possibly_free_var(tmpxvar)
+ self.perform(op, [loc0, tmploc, imm(index.value)], result)
+
# ________________________________________
def not_implemented_op(self, op):
diff --git a/rpython/jit/backend/x86/regloc.py b/rpython/jit/backend/x86/regloc.py
--- a/rpython/jit/backend/x86/regloc.py
+++ b/rpython/jit/backend/x86/regloc.py
@@ -641,6 +641,7 @@
LEA = _binaryop('LEA')
MOVSD = _binaryop('MOVSD')
+ MOVSS = _binaryop('MOVSS')
MOVAPD = _binaryop('MOVAPD')
MOVDQA = _binaryop('MOVDQA')
MOVDQU = _binaryop('MOVDQU')
@@ -654,6 +655,8 @@
CVTTSD2SI = _binaryop('CVTTSD2SI')
CVTSD2SS = _binaryop('CVTSD2SS')
CVTSS2SD = _binaryop('CVTSS2SD')
+ CVTPD2PS = _binaryop('CVTPD2PS')
+ CVTPS2PD = _binaryop('CVTPS2PD')
SQRTSD = _binaryop('SQRTSD')
diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py
--- a/rpython/jit/backend/x86/rx86.py
+++ b/rpython/jit/backend/x86/rx86.py
@@ -703,14 +703,13 @@
CVTTSD2SI_rx = xmminsn('\xF2', rex_w, '\x0F\x2C', register(1, 8), register(2), '\xC0')
CVTTSD2SI_rb = xmminsn('\xF2', rex_w, '\x0F\x2C', register(1, 8), stack_bp(2))
- CVTSD2SS_xx = xmminsn('\xF2', rex_nw, '\x0F\x5A',
- register(1, 8), register(2), '\xC0')
- CVTSD2SS_xb = xmminsn('\xF2', rex_nw, '\x0F\x5A',
- register(1, 8), stack_bp(2))
- CVTSS2SD_xx = xmminsn('\xF3', rex_nw, '\x0F\x5A',
- register(1, 8), register(2), '\xC0')
- CVTSS2SD_xb = xmminsn('\xF3', rex_nw, '\x0F\x5A',
- register(1, 8), stack_bp(2))
+ CVTSD2SS_xx = xmminsn('\xF2', rex_nw, '\x0F\x5A', register(1, 8), register(2), '\xC0')
+ CVTSD2SS_xb = xmminsn('\xF2', rex_nw, '\x0F\x5A', register(1, 8), stack_bp(2))
+ CVTSS2SD_xx = xmminsn('\xF3', rex_nw, '\x0F\x5A', register(1, 8), register(2), '\xC0')
+ CVTSS2SD_xb = xmminsn('\xF3', rex_nw, '\x0F\x5A', register(1, 8), stack_bp(2))
+
+ CVTPD2PS_xx = xmminsn('\x66', rex_nw, '\x0F\x5A', register(1, 8), register(2), '\xC0')
+ CVTPS2PD_xx = xmminsn(rex_nw, '\x0F\x5A', register(1, 8), register(2), '\xC0')
# These work on machine sized registers, so "MOVDQ" is MOVD when running
# on 32 bits and MOVQ when running on 64 bits. "MOVD32" is always 32-bit.
@@ -731,12 +730,15 @@
MOVUPS_jx = xmminsn(rex_nw, '\x0F\x11', register(2, 8), abs_(1))
MOVUPS_ax = xmminsn(rex_nw, '\x0F\x11', register(2, 8), mem_reg_plus_scaled_reg_plus_const(1))
+ MOVSS_xx = xmminsn('\xF3', rex_nw, '\x0F\x10', register(1,8), register(2), '\xC0')
+
PSRLDQ_xi = xmminsn('\x66\x0F\x73', orbyte(0xd8), mem_reg_plus_const(1))
- UNPCKLPD_xx = xmminsn('\x66', rex_nw, '\x0F\x14', register(1, 8), register(2, 8), '\xC0')
- UNPCKHPD_xx = xmminsn('\x66', rex_nw, '\x0F\x15', register(1, 8), register(2, 8), '\xC0')
- UNPCKLPS_xx = xmminsn( rex_nw, '\x0F\x14', register(1, 8), register(2, 8), '\xC0')
- UNPCKHPS_xx = xmminsn( rex_nw, '\x0F\x15', register(1, 8), register(2, 8), '\xC0')
- MOVDDUP_xx = xmminsn('\xF2', rex_nw, '\x0F\x12', register(1, 8), register(2,8), '\xC0')
+ UNPCKLPD_xx = xmminsn('\x66', rex_nw, '\x0F\x14', register(1, 8), register(2), '\xC0')
+ UNPCKHPD_xx = xmminsn('\x66', rex_nw, '\x0F\x15', register(1, 8), register(2), '\xC0')
+ UNPCKLPS_xx = xmminsn( rex_nw, '\x0F\x14', register(1, 8), register(2), '\xC0')
+ UNPCKHPS_xx = xmminsn( rex_nw, '\x0F\x15', register(1, 8), register(2), '\xC0')
+ MOVDDUP_xx = xmminsn('\xF2', rex_nw, '\x0F\x12', register(1, 8), register(2), '\xC0')
+ SHUFPS_xxi = xmminsn(rex_nw, '\x0F\xC6', register(1,8), register(2), '\xC0', immediate(3, 'b'))
# SSE4.1 PEXTRDD_rxi = xmminsn('\x66', rex_nw, '\x0F\x3A\x14', register(1,8), register(2), immediate(3,'b'))
# ------------------------------------------------------------
@@ -897,6 +899,8 @@
define_modrm_modes('MOVSD_x*', ['\xF2', rex_nw, '\x0F\x10', register(1,8)], regtype='XMM')
define_modrm_modes('MOVSD_*x', ['\xF2', rex_nw, '\x0F\x11', register(2,8)], regtype='XMM')
+define_modrm_modes('MOVSS_x*', ['\xF3', rex_nw, '\x0F\x10', register(1,8)], regtype='XMM')
+define_modrm_modes('MOVSS_*x', ['\xF3', rex_nw, '\x0F\x11', register(2,8)], regtype='XMM')
define_modrm_modes('MOVAPD_x*', ['\x66', rex_nw, '\x0F\x28', register(1,8)],
regtype='XMM')
define_modrm_modes('MOVAPD_*x', ['\x66', rex_nw, '\x0F\x29', register(2,8)],
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
@@ -1137,14 +1137,66 @@
v18 = vec_getarrayitem_raw(p0, i5, 2, descr=floatarraydescr)
v19 = vec_cast_float_to_singlefloat(v17, 2)
v20 = vec_cast_float_to_singlefloat(v18, 2)
- v21 = vec_box(4)
- vec_box_pack(v21, v20, 2)
- vec_setarrayitem_raw(p1, i1, v21, 4, descr=singlefloatarraydescr)
+ vec_box_pack(v19, v20, 2)
+ vec_setarrayitem_raw(p1, i1, v19, 4, descr=singlefloatarraydescr)
jump(p0, p1, i7)
"""
vopt = self.vectorize(self.parse_loop(ops))
self.assert_equal(vopt.loop, self.parse_loop(opt))
+ def test_castup_arith_castdown(self):
+ ops = """
+ [p0,p1,p2,i0,i4]
+ guard_early_exit() []
+ i10 = raw_load(p0, i0, descr=singlefloatarraydescr)
+ i1 = int_add(i0, 4)
+ i11 = raw_load(p1, i1, descr=singlefloatarraydescr)
+ i2 = int_add(i1, 4)
+ f1 = cast_singlefloat_to_float(i10)
+ f2 = cast_singlefloat_to_float(i11)
+ f3 = float_add(f1, f2)
+ i12 = cast_float_to_singlefloat(f3)
+ raw_store(p2, i4, i12, descr=singlefloatarraydescr)
+ i5 = int_add(i4, 4)
+ i186 = int_lt(i5, 100)
+ guard_false(i186) []
+ jump(p0,p1,p2,i2,i5)
+ """
+ opt = """
+ [p0, p1, p2, i0, i4]
+ guard_early_exit() []
+ i5 = int_add(i4, 4)
+ i1 = int_add(i0, 4)
+ i186 = int_lt(i5, 100)
+ i2 = int_add(i0, 8)
+ i187 = int_add(i4, 8)
+ i191 = int_add(i0, 12)
+ i190 = int_lt(i187, 100)
+ i192 = int_add(i0, 16)
+ i188 = int_add(i4, 12)
+ i200 = int_add(i0, 20)
+ i199 = int_lt(i188, 100)
+ i201 = int_add(i0, 24)
+ i189 = int_add(i4, 16)
+ i209 = int_add(i0, 28)
+ i208 = int_lt(i189, 100)
+ guard_false(i208) []
+ i210 = int_add(i0, 32)
+ v217 = vec_raw_load(p0, i0, 4, descr=singlefloatarraydescr)
+ v218 = vec_cast_singlefloat_to_float(v217, 0, 2)
+ v219 = vec_cast_singlefloat_to_float(v217, 2, 2)
+ v220 = vec_raw_load(p1, i1, 4, descr=singlefloatarraydescr)
+ v221 = vec_cast_singlefloat_to_float(v220, 0, 2)
+ v222 = vec_cast_singlefloat_to_float(v220, 2, 2)
+ v223 = vec_float_add(v218, v221, 2)
+ v224 = vec_float_add(v219, v222, 2)
+ v225 = vec_cast_float_to_singlefloat(v223, 2)
+ v226 = vec_cast_float_to_singlefloat(v224, 2)
+ vec_raw_store(p2, i4, v225, 4, descr=singlefloatarraydescr)
+ jump(p0, p1, p2, i210, i189)
+ """
+ vopt = self.vectorize(self.parse_loop(ops))
+ self.assert_equal(vopt.loop, self.parse_loop(opt))
class TestLLtype(BaseTestVectorize, LLtypeMixin):
pass
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -402,7 +402,7 @@
if vbox:
arg_cloned = arg.clonebox()
cj = ConstInt(j)
- ci = ConstInt(vbox.item_count)
+ ci = ConstInt(1)
unpack_op = ResOperation(rop.VEC_BOX_UNPACK, [vbox, cj, ci], arg_cloned)
self.emit_operation(unpack_op)
sched_data.rename_unpacked(arg, arg_cloned)
@@ -552,19 +552,64 @@
# this might be an indicator for edge removal
return True
+class PackType(PrimitiveTypeMixin):
+ UNKNOWN_TYPE = '-'
+
+ def __init__(self, type, size, signed):
+ self.type = type
+ self.size = size
+ self.signed = signed
+
+ def gettype(self):
+ return self.type
+
+ def getsize(self):
+ return self.size
+
+ def getsigned(self):
+ return self.signed
+
+ def get_byte_size(self):
+ return self.size
+
+ @staticmethod
+ def by_descr(descr):
+ _t = INT
+ if descr.is_array_of_floats() or descr.loaded_float:
+ _t = FLOAT
+ pt = PackType(_t, descr.get_item_size_in_bytes(), descr.is_item_signed())
+ return pt
+
+ def record_vbox(self, vbox):
+ if self.type == PackType.UNKNOWN_TYPE:
+ self.type = vbox.type
+ self.signed = vbox.signed
+ if vbox.item_size > self.size:
+ self.size = vbox.item_size
+
+ def __repr__(self):
+ return 'PackType(%s, %s, %s)' % (self.type, self.size, self.signed)
+
+ def clone(self):
+ return PackType(self.type, self.size, self.signed)
+
+
class PackArgs(object):
- def __init__(self, arg_pos, result=True):
+ def __init__(self, arg_pos, result_type=None, result=True, index=-1):
self.mask = 0
+ self.result_type = result_type
+ self.result = result
+ self.index = index
for p in arg_pos:
- self.mask |= (1<<(p+1))
- if result:
- self.mask |= 1
+ self.mask |= (1<<p)
- def arg_is_set(self, i):
- return bool((1<<(i+1)) & self.mask)
+ def getpacktype(self):
+ if self.result_type is not None:
+ return self.result_type.clone()
+ return PackType(PackType.UNKNOWN_TYPE, 0, True)
- def result_is_set(self):
- return bool(1 & self.mask)
+ def vector_arg(self, i):
+ return bool((1<<(i)) & self.mask)
ROP_ARG_RES_VECTOR = {
@@ -576,14 +621,17 @@
rop.VEC_FLOAT_ADD: PackArgs((0,1)),
rop.VEC_FLOAT_SUB: PackArgs((0,1)),
rop.VEC_FLOAT_MUL: PackArgs((0,1)),
- rop.VEC_FLOAT_EQ: PackArgs((0,1)),
+ rop.VEC_FLOAT_EQ: PackArgs((0,1), result_type=PackType(INT, -1, True)),
rop.VEC_RAW_LOAD: PackArgs(()),
rop.VEC_GETARRAYITEM_RAW: PackArgs(()),
rop.VEC_RAW_STORE: PackArgs((2,), result=False),
rop.VEC_SETARRAYITEM_RAW: PackArgs((2,), result=False),
- rop.VEC_CAST_FLOAT_TO_SINGLEFLOAT: PackArgs((0,)),
+ rop.VEC_CAST_FLOAT_TO_SINGLEFLOAT: PackArgs((0,), result_type=PackType(FLOAT, 4, True)),
+ rop.VEC_CAST_SINGLEFLOAT_TO_FLOAT: PackArgs((0,), result_type=PackType(FLOAT, 8, True), index=1),
+ rop.VEC_CAST_FLOAT_TO_INT: PackArgs((0,), result_type=PackType(INT, 8, True)),
+ rop.VEC_CAST_INT_TO_FLOAT: PackArgs((0,), result_type=PackType(FLOAT, 8, True)),
}
@@ -632,32 +680,40 @@
op0 = self.pack.operations[self.pack_off].getoperation()
assert op0.vector != -1
args = op0.getarglist()[:]
- args.append(ConstInt(self.pack_ops))
- vop = ResOperation(op0.vector, args, op0.result, op0.getdescr())
packargs = ROP_ARG_RES_VECTOR.get(op0.vector, None)
if packargs is None:
raise NotImplementedError("vecop map entry missing. trans: pack -> vop")
+ if packargs.index != -1:
+ args.append(ConstInt(self.pack_off))
+
+ args.append(ConstInt(self.pack_ops))
+ vop = ResOperation(op0.vector, args, op0.result, op0.getdescr())
+
for i,arg in enumerate(args):
- if packargs.arg_is_set(i):
+ if packargs.vector_arg(i):
self.vector_arg(vop, i, True)
- if packargs.result_is_set():
- self.vector_result(vop)
+ if packargs.result:
+ self.vector_result(vop, packargs)
self.preamble_ops.append(vop)
def propagete_ptype(self):
- op0 = self.pack.operations[self.pack_off].getoperation()
+ op0 = self.pack.operations[0].getoperation()
packargs = ROP_ARG_RES_VECTOR.get(op0.vector, None)
if packargs is None:
raise NotImplementedError("vecop map entry missing. trans: pack -> vop")
args = op0.getarglist()[:]
- ptype = PackType(PackType.UNKNOWN_TYPE, 0, True)
+ ptype = packargs.getpacktype()
for i,arg in enumerate(args):
- if packargs.arg_is_set(i):
+ if packargs.vector_arg(i):
vbox = self.get_vbox_for(arg)
- ptype.record_vbox(vbox)
+ if vbox is not None:
+ ptype.record_vbox(vbox)
+ else:
+ ptype.size = arg
+ raise NotImplementedError
self.pack.ptype = ptype
@@ -668,10 +724,17 @@
except KeyError:
return None
- def vector_result(self, vop):
+ def vector_result(self, vop, packargs):
ops = self.pack.operations
result = vop.result
- vop.result = vbox = self.box_vector(self.pack.ptype)
+ if packargs.result_type is not None:
+ ptype = packargs.getpacktype()
+ if ptype.size == -1:
+ ptype.size = self.pack.ptype.size
+ vbox = self.box_vector(ptype)
+ else:
+ vbox = self.box_vector(self.pack.ptype)
+ vop.result = vbox
i = self.pack_off
end = i + self.pack_ops
while i < end:
@@ -692,28 +755,28 @@
assert False, "not allowed to expand" \
", but do not have a vector box as arg"
# vbox is a primitive type mixin
- if self.pack.ptype.getsize() < vbox.getsize():
- packable = self.vec_reg_size // self.pack.ptype.getsize()
- packed = vbox.item_count
- vbox = self.pack_arguments(packed, [op.getoperation().getarg(argidx) for op in ops])
+ packable = self.vec_reg_size // self.pack.ptype.getsize()
+ packed = vbox.item_count
+ if packed < packable:
+ args = [op.getoperation().getarg(argidx) for op in ops]
+ self.package(vbox, packed, args)
vop.setarg(argidx, vbox)
return vbox
- def pack_arguments(self, index, args):
+ def package(self, tgt_box, index, args):
+ arg_count = len(args)
i = index
- vbox = self.box_vector(self.pack.ptype)
- op = ResOperation(rop.VEC_BOX, [ConstInt(len(args))], vbox)
- self.preamble_ops.append(op)
- arg_count = len(args)
while i < arg_count:
arg = args[i]
- vbox2 = self.get_vbox_for(arg)
- if vbox2 is None:
- raise NotImplementedError
- op = ResOperation(rop.VEC_BOX_PACK, [vbox, vbox2, ConstInt(i)], None)
+ pos, src_box = self.box_to_vbox.get(arg, (-1, None))
+ if pos != 0:
+ i += 1
+ continue
+ op = ResOperation(rop.VEC_BOX_PACK,
+ [tgt_box, src_box, ConstInt(i),
+ ConstInt(src_box.item_count)], None)
self.preamble_ops.append(op)
- i += vbox.item_count
- return vbox
+ i += 1
def expand_box_to_vector_box(self, vop, argidx):
arg = vop.getarg(argidx)
@@ -751,44 +814,6 @@
return True
return False
-class PackType(PrimitiveTypeMixin):
- UNKNOWN_TYPE = '-'
-
- def __init__(self, type, size, signed):
- self.type = type
- self.size = size
- self.signed = signed
-
- def gettype(self):
- return self.type
-
- def getsize(self):
- return self.size
-
- def getsigned(self):
- return self.signed
-
- def get_byte_size(self):
- return self.size
-
- @staticmethod
- def by_descr(descr):
- _t = INT
- if descr.is_array_of_floats():
- _t = FLOAT
- pt = PackType(_t, descr.get_item_size_in_bytes(), descr.is_item_signed())
- return pt
-
- def record_vbox(self, vbox):
- if self.type == PackType.UNKNOWN_TYPE:
- self.type = vbox.type
- self.signed = vbox.signed
- if vbox.item_size > self.size:
- self.size = vbox.item_size
-
- def __repr__(self):
- return 'PackType(%s, %s, %s)' % (self.type, self.size, self.signed)
-
class PackSet(object):
diff --git a/rpython/jit/metainterp/resoperation.py b/rpython/jit/metainterp/resoperation.py
--- a/rpython/jit/metainterp/resoperation.py
+++ b/rpython/jit/metainterp/resoperation.py
@@ -464,7 +464,12 @@
'VEC_FLOAT_EQ/3',
'VEC_INT_SIGNEXT/3',
+ # double -> float: v2 = cast(v1, 2) equal to v2 = (v1[0], v1[1], X, X)
'VEC_CAST_FLOAT_TO_SINGLEFLOAT/2',
+ # v4 = cast(v3, 0, 2), v4 = (v3[0], v3[1])
+ 'VEC_CAST_SINGLEFLOAT_TO_FLOAT/3',
+ 'VEC_CAST_FLOAT_TO_INT/2',
+ 'VEC_CAST_INT_TO_FLOAT/2',
'VEC_BOX_UNPACK/3', # iX|fX = VEC_BOX_UNPACK(vX, index, item_count)
'VEC_BOX_PACK/4', # VEC_BOX_PACK(vX, var/const, index, item_count)
@@ -734,7 +739,11 @@
rop.FLOAT_EQ: rop.VEC_FLOAT_EQ,
rop.INT_SIGNEXT: rop.VEC_INT_SIGNEXT,
+
rop.CAST_FLOAT_TO_SINGLEFLOAT: rop.VEC_CAST_FLOAT_TO_SINGLEFLOAT,
+ rop.CAST_SINGLEFLOAT_TO_FLOAT: rop.VEC_CAST_SINGLEFLOAT_TO_FLOAT,
+ rop.CAST_INT_TO_FLOAT: rop.VEC_CAST_INT_TO_FLOAT,
+ rop.CAST_FLOAT_TO_INT: rop.VEC_CAST_FLOAT_TO_INT,
}
def setup2():
More information about the pypy-commit
mailing list