[pypy-commit] pypy vecopt: added guard_true/false for vector register as first argument
plan_rich
noreply at buildbot.pypy.org
Sat Jun 27 18:09:51 CEST 2015
Author: Richard Plangger <rich at pasra.at>
Branch: vecopt
Changeset: r78330:9150ce6cdf52
Date: 2015-06-27 18:10 +0200
http://bitbucket.org/pypy/pypy/changeset/9150ce6cdf52/
Log: added guard_true/false for vector register as first argument
diff --git a/pypy/module/micronumpy/test/test_zjit.py b/pypy/module/micronumpy/test/test_zjit.py
--- a/pypy/module/micronumpy/test/test_zjit.py
+++ b/pypy/module/micronumpy/test/test_zjit.py
@@ -276,7 +276,11 @@
def test_int8_expand(self):
result = self.run("int8_expand")
assert int(result) == 17*8 + sum(range(0,17))
- self.check_vectorized(3, 1) # TODO sum at the end
+ # does not pay off to cast float64 -> int8
+ # neither does sum
+ # a + c should work, but it is given as a parameter
+ # thus the accum must handle this!
+ self.check_vectorized(3, 0) # TODO
def define_int32_add_const():
return """
@@ -535,25 +539,109 @@
def define_any():
return """
+ a = astype([0,0,0,0,0,0,0,1,0,0,0],int8)
+ any(a)
+ """
+
+ def define_any_int():
+ return """
+ a = astype([0,0,0,0,256,65537,0,0,0,0,0],int16)
+ any(a)
+ """
+
+ def define_any_ret_0():
+ return """
+ a = astype([0,0,0,0,0,0,0,0,0,0,0],int64)
+ any(a)
+ """
+
+ def define_float_any():
+ return """
a = [0,0,0,0,0,0,0,1,0,0,0]
any(a)
"""
+ def define_float32_any():
+ return """
+ a = astype([0,0,0,0,0,0,0,1,0,0,0], float32)
+ any(a)
+ """
+
+ def test_float_any(self):
+ result = self.run("float_any")
+ assert int(result) == 1
+ self.check_vectorized(2, 2)
+
+ def test_float32_any(self):
+ result = self.run("float32_any")
+ assert int(result) == 1
+ self.check_vectorized(1, 1)
+
def test_any(self):
- result = self.run("any")
- assert result == 1
- self.check_vectorized(1, 0)
+ result = self.run("float_any")
+ assert int(result) == 1
+ self.check_vectorized(1, 1)
+
+ def test_any_int(self):
+ result = self.run("any_int")
+ assert int(result) == 1
+ self.check_vectorized(2, 1)
+
+ def test_any_ret_0(self):
+ result = self.run("any_ret_0")
+ assert int(result) == 0
+ self.check_vectorized(2, 2)
def define_all():
return """
+ a = astype([1,1,1,1,1,1,1,1],int32)
+ all(a)
+ """
+ def define_all_int():
+ return """
+ a = astype([1,100,255,1,3,1,1,1],int32)
+ all(a)
+ """
+ def define_all_ret_0():
+ return """
+ a = astype([1,1,1,1,1,0,1,1],int32)
+ all(a)
+ """
+ def define_float_all():
+ return """
a = [1,1,1,1,1,1,1,1]
all(a)
"""
+ def define_float32_all():
+ return """
+ a = astype([1,1,1,1,1,1,1,1],float32)
+ all(a)
+ """
+
+ def test_float_all(self):
+ result = self.run("float_all")
+ assert int(result) == 1
+ self.check_vectorized(2, 2)
+
+ def test_float_all(self):
+ result = self.run("float32_all")
+ assert int(result) == 1
+ self.check_vectorized(2, 2)
def test_all(self):
result = self.run("all")
- assert result == 1
- self.check_vectorized(1, 1)
+ assert int(result) == 1
+ self.check_vectorized(2, 2)
+
+ def test_all_int(self):
+ result = self.run("all_int")
+ assert int(result) == 1
+ self.check_vectorized(2, 2)
+
+ def test_all_ret_0(self):
+ result = self.run("all_ret_0")
+ assert int(result) == 0
+ self.check_vectorized(2, 2)
def define_logical_xor_reduce():
return """
diff --git a/rpython/doc/jit/vectorization.rst b/rpython/doc/jit/vectorization.rst
--- a/rpython/doc/jit/vectorization.rst
+++ b/rpython/doc/jit/vectorization.rst
@@ -54,5 +54,8 @@
The opcode needed spans over multiple instructions. In terms of performance
there might only be little to non advantage to use SIMD instructions for this
conversions.
+* For a guard that checks true/false on a vector integer register, it would be handy
+ to have 2 xmm registers (one filled with zero bits and the other with every bit set to one).
+ This saves 2 instructions per guard check, at the cost of higher register pressure.
.. _PMUL: http://stackoverflow.com/questions/8866973/can-long-integer-routines-benefit-from-sse/8867025#8867025
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -1644,10 +1644,38 @@
self.mc.MOVD32_xr(resloc.value, eax.value)
self.mc.PUNPCKLDQ_xx(resloc.value, loc1.value)
+ def genop_guard_vector_arg(self, guard_op, loc):
+ arg = guard_op.getarg(0)
+ assert isinstance(arg, BoxVector)
+ size = arg.item_size
+ temp = X86_64_XMM_SCRATCH_REG
+ #
+ self.mc.PXOR(temp, temp)
+ # if the vector is not fully packed blend 1s
+ if not arg.fully_packed(self.cpu.vector_register_size):
+ self.mc.PCMPEQQ(temp, temp) # fill with ones
+ select = 0
+ bits_used = (arg.item_count * arg.item_size * 8)
+ index = bits_used // 16
+ while index < 8:
+ select |= (1 << index)
+ index += 1
+ self.mc.PBLENDW_xxi(loc, temp, select)
+ # reset to zeros
+ self.mc.PXOR(temp, temp)
+
+ self.mc.PCMPEQ(size, loc, temp)
+ self.mc.PCMPEQQ(temp, temp)
+ self.mc.PTEST(loc, temp)
+
def genop_guard_guard_true(self, ign_1, guard_op, guard_token, locs, ign_2):
loc = locs[0]
- self.mc.TEST(loc, loc)
- self.implement_guard(guard_token, 'Z')
+ if loc.is_xmm:
+ self.genop_guard_vector_arg(guard_op, loc)
+ self.implement_guard(guard_token, 'Z')
+ else:
+ self.mc.TEST(loc, loc)
+ self.implement_guard(guard_token, 'Z')
genop_guard_guard_nonnull = genop_guard_guard_true
def genop_guard_guard_no_exception(self, ign_1, guard_op, guard_token,
@@ -1724,8 +1752,12 @@
def genop_guard_guard_false(self, ign_1, guard_op, guard_token, locs, ign_2):
loc = locs[0]
- self.mc.TEST(loc, loc)
- self.implement_guard(guard_token, 'NZ')
+ if loc.is_xmm:
+ self.genop_guard_vector_arg(guard_op, loc)
+ self.implement_guard(guard_token, 'Z')
+ else:
+ self.mc.TEST(loc, loc)
+ self.implement_guard(guard_token, 'NZ')
genop_guard_guard_isnull = genop_guard_guard_false
def genop_guard_guard_value(self, ign_1, guard_op, guard_token, locs, ign_2):
@@ -2723,7 +2755,7 @@
def genop_vec_int_expand(self, op, arglocs, resloc):
srcloc, sizeloc = arglocs
if not isinstance(srcloc, RegLoc):
- self.mov(X86_64_SCRATCH_REG, srcloc)
+ self.mov(srcloc, X86_64_SCRATCH_REG)
srcloc = X86_64_SCRATCH_REG
assert not srcloc.is_xmm
size = sizeloc.value
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -390,12 +390,22 @@
return self.xrm.loc(v)
return self.rm.loc(v)
+ def _consider_guard_tf(self, op):
+ arg = op.getarg(0)
+ if arg.type == VECTOR:
+ assert arg.item_type == INT
+ loc = self.xrm.make_sure_var_in_reg(arg)
+ else:
+ loc = self.rm.make_sure_var_in_reg(arg)
+ self.perform_guard(op, [loc], None)
+
+ consider_guard_true = _consider_guard_tf
+ consider_guard_false = _consider_guard_tf
+
def _consider_guard(self, op):
loc = self.rm.make_sure_var_in_reg(op.getarg(0))
self.perform_guard(op, [loc], None)
- consider_guard_true = _consider_guard
- consider_guard_false = _consider_guard
consider_guard_nonnull = _consider_guard
consider_guard_isnull = _consider_guard
diff --git a/rpython/jit/backend/x86/regloc.py b/rpython/jit/backend/x86/regloc.py
--- a/rpython/jit/backend/x86/regloc.py
+++ b/rpython/jit/backend/x86/regloc.py
@@ -601,6 +601,28 @@
self._reuse_scratch_register = False
self._scratch_register_known = False
+ def _vector_size_choose(name):
+ def invoke(self, suffix, val1, val2):
+ methname = name + suffix
+ _rx86_getattr(self, methname)(val1, val2)
+ invoke._annspecialcase_ = 'specialize:arg(1)'
+
+ def INSN(self, size, loc1, loc2):
+ code1 = loc1.location_code()
+ code2 = loc2.location_code()
+ val1 = getattr(loc1, "value_" + code1)()
+ val2 = getattr(loc2, "value_" + code2)()
+ suffix = 'B'
+ if size == 2:
+ suffix = 'W'
+ elif size == 4:
+ suffix = 'D'
+ else:
+ suffix = 'Q'
+ invoke(self, suffix + "_"+ code1+code2, val1, val2)
+
+ return INSN
+
AND = _binaryop('AND')
OR = _binaryop('OR')
OR8 = _binaryop('OR8')
@@ -610,6 +632,7 @@
SHR = _binaryop('SHR')
SAR = _binaryop('SAR')
TEST = _binaryop('TEST')
+ PTEST = _binaryop('PTEST')
TEST8 = _binaryop('TEST8')
BTS = _binaryop('BTS')
@@ -621,6 +644,11 @@
CMP = _binaryop('CMP')
CMP16 = _binaryop('CMP16')
+ PCMPEQQ = _binaryop('PCMPEQQ')
+ PCMPEQD = _binaryop('PCMPEQD')
+ PCMPEQW = _binaryop('PCMPEQW')
+ PCMPEQB = _binaryop('PCMPEQB')
+ PCMPEQ = _vector_size_choose('PCMPEQ')
MOV = _binaryop('MOV')
MOV8 = _binaryop('MOV8')
MOV16 = _binaryop('MOV16')
@@ -698,7 +726,6 @@
PAND = _binaryop('PAND')
POR = _binaryop('POR')
PXOR = _binaryop('PXOR')
- PCMPEQD = _binaryop('PCMPEQD')
PSRLDQ = _binaryop('PSRLDQ')
MOVDQ = _binaryop('MOVDQ')
diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py
--- a/rpython/jit/backend/x86/rx86.py
+++ b/rpython/jit/backend/x86/rx86.py
@@ -766,6 +766,8 @@
PINSRB_xri = xmminsn('\x66', rex_nw, '\x0F\x3A\x20', register(1,8), register(2), '\xC0', immediate(3, 'b'))
INSERTPS_xxi = xmminsn('\x66', rex_nw, '\x0F\x3A\x21', register(1,8), register(2), '\xC0', immediate(3, 'b'))
+ PTEST_xx = xmminsn('\x66', rex_nw, '\x0F\x38\x17', register(1,8), register(2), '\xC0')
+ PBLENDW_xxi = xmminsn('\x66', rex_nw, '\x0F\x3A\x0E', register(1,8), register(2), '\xC0', immediate(3, 'b'))
# ------------------------------------------------------------
@@ -1003,7 +1005,10 @@
define_pxmm_insn('PUNPCKHDQ_x*', '\x6A')
define_pxmm_insn('PUNPCKLQDQ_x*', '\x6C')
define_pxmm_insn('PUNPCKHQDQ_x*', '\x6D')
+define_pxmm_insn('PCMPEQQ_x*', '\x38\x29')
define_pxmm_insn('PCMPEQD_x*', '\x76')
+define_pxmm_insn('PCMPEQW_x*', '\x75')
+define_pxmm_insn('PCMPEQB_x*', '\x74')
# ____________________________________________________________
diff --git a/rpython/jit/metainterp/history.py b/rpython/jit/metainterp/history.py
--- a/rpython/jit/metainterp/history.py
+++ b/rpython/jit/metainterp/history.py
@@ -540,6 +540,9 @@
def getcount(self):
return self.item_count
+ def fully_packed(self, vec_reg_size):
+ return self.item_size * self.item_count == vec_reg_size
+
def forget_value(self):
raise NotImplementedError("cannot forget value of vector")
diff --git a/rpython/jit/metainterp/optimizeopt/schedule.py b/rpython/jit/metainterp/optimizeopt/schedule.py
--- a/rpython/jit/metainterp/optimizeopt/schedule.py
+++ b/rpython/jit/metainterp/optimizeopt/schedule.py
@@ -367,7 +367,7 @@
# the argument has more items than the operation is able to process!
# box_pos == 0 then it is already at the right place
if box_pos != 0:
- args[i] = self.unpack(vbox, box_pos, packable, self.input_type)
+ args[i] = self.unpack(vbox, box_pos, packed - box_pos, self.input_type)
self.update_arg_in_vector_pos(i, args[i])
#self.update_input_output(self.pack)
continue
@@ -384,7 +384,7 @@
if box_pos != 0:
# The vector box is at a position != 0 but it
# is required to be at position 0. Unpack it!
- args[i] = self.unpack(vbox, box_pos, packable, self.input_type)
+ args[i] = self.unpack(vbox, box_pos, packed - box_pos, self.input_type)
self.update_arg_in_vector_pos(i, args[i])
continue
#self.update_input_output(self.pack)
@@ -450,6 +450,7 @@
def unpack(self, vbox, index, count, arg_ptype):
assert index < vbox.item_count
assert index + count <= vbox.item_count
+ assert count > 0
vbox_cloned = vectorbox_clone_set(vbox, count=count)
opnum = getunpackopnum(vbox.item_type)
op = ResOperation(opnum, [vbox, ConstInt(index), ConstInt(count)], vbox_cloned)
@@ -787,7 +788,6 @@
def setvector_of_box(self, box, off, vector):
assert off < vector.item_count
- print "set" , box, "[",off,"] =", vector
self.box_to_vbox[box] = (off, vector)
def prepend_invariant_operations(self, oplist):
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
@@ -1357,40 +1357,23 @@
def test_abc(self):
trace="""
- label(p0, p1, p5, p6, p7, p17, p19, i53, i39, i44, i49, i51, descr=TargetToken(140531585719072))
- guard_not_invalidated(descr=<Guard0x7fd00f3ebdb0>) [p1, p0, p5, p6, p7, p17, p19]
- i63 = int_ge(i53, 2024)
- guard_false(i63, descr=<Guard0x7fd00f3ebe08>) [p1, p0, p5, p6, p7, p17, p19, i53]
- i64 = int_lt(i53, i39)
- guard_true(i64, descr=<Guard0x7fd00f3ebe60>) [p1, p0, i53, p5, p6, p7, p17, p19, None]
- f65 = getarrayitem_raw(i44, i53, descr=floatarraydescr)
- f66 = float_add(f65, 1.000000)
- i67 = int_lt(i53, i49)
- guard_true(i67, descr=<Guard0x7fd00f3ebeb8>) [p1, p0, i53, p5, p6, p7, p17, p19, f66, None]
- setarrayitem_raw(i51, i53, f66, descr=floatarraydescr)
- i68 = int_add(i53, 1)
- i69 = getfield_raw(140531584083072, descr=<FieldS pypysig_long_struct.c_value 0>)
- setfield_gc(59, i68, descr=<FieldS pypy.objspace.std.typeobject.IntMutableCell.inst_intvalue 8>)
- i70 = int_lt(i69, 0)
- guard_false(i70, descr=<Guard0x7fd00f3ebf10>) [p1, p0, p5, p6, p7, p17, p19, None, None]
- jump(p0, p1, p5, p6, p7, p17, p19, i68, i39, i44, i49, i51)
- """
- trace="""
- [p0, p1, p9, i10, p4, i11, p3, p6, p12, i13, i14, i15, f16, i17, i18]
- guard_early_exit(descr=<rpython.jit.metainterp.compile.ResumeAtLoopHeaderDescr object at 0x7f2327d4b390>) [p6, p4, p3, p1, p0, i14, i10, i13, i11, p9, p12]
- i19 = raw_load(i15, i11, descr=singlefloatarraydescr)
- guard_not_invalidated(descr=<rpython.jit.metainterp.compile.ResumeGuardNotInvalidated object at 0x7f23284786d0>) [p6, p4, p3, p1, p0, i19, i14, i10, i13, i11, p9, p12]
- i21 = int_add(i11, 4)
- f22 = cast_singlefloat_to_float(i19)
- f23 = float_add(f22, f16)
- i24 = cast_float_to_singlefloat(f23)
- raw_store(i17, i14, i24, descr=singlefloatarraydescr)
- i26 = int_add(i13, 1)
- i28 = int_add(i14, 4)
- i29 = int_ge(i26, i18)
- guard_false(i29, descr=<rpython.jit.metainterp.compile.ResumeGuardFalseDescr object at 0x7f2327d53910>) [p6, p4, p3, p1, p0, i28, i21, i26, None, i10, None, None, p9, p12]
- debug_merge_point(0, 0, '(numpy_call2: no get_printable_location)')
- jump(p0, p1, p9, i10, p4, i21, p3, p6, p12, i26, i28, i15, f16, i17, i18)
+ [p0, p9, i10, p3, i11, p12, i13, p6, i14, p7, p15, i16, i17, i18, i19, i20, i21]
+ guard_early_exit(descr=<rpython.jit.metainterp.compile.ResumeAtLoopHeaderDescr object at 0x7f09b34aad50>) [p7, p6, p3, p0, i14, i17, i16, p9, p15, i11, i10, p12, i13]
+ i22 = raw_load(i18, i11, descr=singlefloatarraydescr)
+ guard_not_invalidated(descr=<rpython.jit.metainterp.compile.ResumeGuardNotInvalidated object at 0x7f09b34fd390>) [p7, p6, p3, p0, i22, i14, i17, i16, p9, p15, i11, i10, p12, i13]
+ i24 = int_add(i11, 4)
+ i25 = raw_load(i19, i17, descr=singlefloatarraydescr)
+ i27 = int_add(i17, 4)
+ f28 = cast_singlefloat_to_float(i22)
+ f29 = cast_singlefloat_to_float(i25)
+ f30 = float_add(f28, f29)
+ i31 = cast_float_to_singlefloat(f30)
+ raw_store(i20, i14, i31, descr=singlefloatarraydescr)
+ i33 = int_add(i13, 1)
+ i35 = int_add(i14, 4)
+ i36 = int_ge(i33, i21)
+ guard_false(i36, descr=<rpython.jit.metainterp.compile.ResumeGuardFalseDescr object at 0x7f09b34b7c10>) [p7, p6, p3, p0, i35, i24, i33, i27, None, None, i16, p9, p15, None, i10, p12, None]
+ jump(p0, p9, i10, p3, i24, p12, i33, p6, i35, p7, p15, i16, i27, i18, i19, i20, i21)
"""
opt = self.vectorize(self.parse_loop(trace))
self.debug_print_operations(opt.loop)
More information about the pypy-commit
mailing list