[pypy-commit] pypy vecopt: don't follow wrong dependency chains, excluded fail args
plan_rich
noreply at buildbot.pypy.org
Fri Jun 26 09:56:35 CEST 2015
Author: Richard Plangger <rich at pasra.at>
Branch: vecopt
Changeset: r78318:8afb499c0584
Date: 2015-06-26 09:56 +0200
http://bitbucket.org/pypy/pypy/changeset/8afb499c0584/
Log: don't follow wrong dependency chains, excluded fail args; only a store
	is not allowed to compute operations if the vector is not fully
	packed
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -2722,7 +2722,9 @@
def genop_vec_int_expand(self, op, arglocs, resloc):
srcloc, sizeloc = arglocs
- assert isinstance(srcloc, RegLoc)
+ if not isinstance(srcloc, RegLoc):
+ self.mov(X86_64_SCRATCH_REG, srcloc)
+ srcloc = X86_64_SCRATCH_REG
assert not srcloc.is_xmm
size = sizeloc.value
if size == 1:
diff --git a/rpython/jit/metainterp/optimizeopt/schedule.py b/rpython/jit/metainterp/optimizeopt/schedule.py
--- a/rpython/jit/metainterp/optimizeopt/schedule.py
+++ b/rpython/jit/metainterp/optimizeopt/schedule.py
@@ -269,8 +269,10 @@
left = len(pack.operations)
assert stride > 0
while off < len(pack.operations):
- if left < stride:
- self.preamble_ops.append(pack.operations[off].getoperation())
+ print left, "<", stride
+ if stride == 1:
+ op = pack.operations[off].getoperation()
+ self.preamble_ops.append(op)
off += 1
continue
ops = pack.operations[off:off+stride]
@@ -294,9 +296,6 @@
if bytes > vec_reg_size:
# too many bytes. does not fit into the vector register
return vec_reg_size // self.getscalarsize()
- if bytes < vec_reg_size:
- # not enough to fill the vector register
- return 1
return pack.opcount()
def getscalarsize(self):
@@ -316,12 +315,16 @@
if isinstance(arg, BoxVector):
continue
if self.is_vector_arg(i):
- args[i] = self.transform_argument(args[i], i, off)
+ args[i] = self.transform_argument(args[i], i, off, stride)
#
result = op.result
result = self.transform_result(result, off)
#
vop = ResOperation(op.vector, args, result, op.getdescr())
+ if op.is_guard():
+ assert isinstance(op, GuardResOp)
+ vop.setfailargs(op.getfailargs())
+ vop.rd_snapshot = op.rd_snapshot
self.preamble_ops.append(vop)
def transform_result(self, result, off):
@@ -342,7 +345,7 @@
signed = self.output_type.signed
return BoxVector(type, count, size, signed)
- def transform_argument(self, arg, argidx, off):
+ def transform_argument(self, arg, argidx, off, stride):
ops = self.pack.operations
box_pos, vbox = self.sched_data.getvector_of_box(arg)
if not vbox:
@@ -359,7 +362,8 @@
packed = vbox.item_count
assert packed >= 0
assert packable >= 0
- if packed < packable:
+ vboxes = self.vector_boxes_for_args(argidx)
+ if len(vboxes) > 1: # packed < packable and packed < stride:
# the argument is scattered along different vector boxes
args = [op.getoperation().getarg(argidx) for op in ops]
vbox = self._pack(vbox, packed, args, packable)
@@ -379,8 +383,20 @@
vbox = self.unpack(vbox, args, off, len(ops), self.input_type)
self.update_input_output(self.pack)
#
+ assert vbox is not None
return vbox
+ def vector_boxes_for_args(self, index):
+ args = [op.getoperation().getarg(index) for op in self.pack.operations]
+ vboxes = []
+ last_vbox = None
+ for arg in args:
+ pos, vbox = self.sched_data.getvector_of_box(arg)
+ if vbox != last_vbox and vbox is not None:
+ vboxes.append(vbox)
+ return vboxes
+
+
def extend(self, vbox, newtype):
assert vbox.gettype() == newtype.gettype()
if vbox.gettype() == INT:
@@ -443,6 +459,7 @@
self.sched_data.setvector_of_box(arg, j, new_box)
tgt_box = new_box
_, vbox = self.sched_data.getvector_of_box(args[0])
+ assert vbox is not None
return vbox
def _check_vec_pack(self, op):
@@ -589,6 +606,11 @@
return BoxVector(type, count, size, signed)
class StoreToVectorStore(OpToVectorOp):
+ """
+ Storing operations are special because they are not allowed
+ to store to memory if the vector is not fully filled.
+ Thus a modified split_pack function
+ """
def __init__(self):
OpToVectorOp.__init__(self, (None, None, PT_GENERIC), None)
self.has_descr = True
@@ -599,6 +621,20 @@
def determine_output_type(self, op):
return None
+ def split_pack(self, pack, vec_reg_size):
+ """ Returns how many items of the pack should be
+ emitted as vector operation. """
+ bytes = pack.opcount() * self.getscalarsize()
+ if bytes > vec_reg_size:
+ # too many bytes. does not fit into the vector register
+ return vec_reg_size // self.getscalarsize()
+ if bytes < vec_reg_size:
+ # special case for store, even though load is allowed
+ # to load more, store is not!
+ # not enough to fill the vector register
+ return 1
+ return pack.opcount()
+
class PassThroughOp(OpToVectorOp):
""" This pass through is only applicable if the target
operation is capable of handling vector operations.
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_schedule.py b/rpython/jit/metainterp/optimizeopt/test/test_schedule.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_schedule.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_schedule.py
@@ -79,6 +79,16 @@
for op in vsd.as_vector_operation(pack, renamer):
ops.append(op)
loop.operations = ops
+ metainterp_sd = FakeMetaInterpStaticData(self.cpu)
+ jitdriver_sd = FakeJitDriverStaticData()
+ opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, loop, 0)
+ opt.clear_newoperations()
+ for op in ops:
+ opt.unpack_from_vector(op, vsd, renamer)
+ opt.emit_operation(op)
+ ops = opt._newoperations
+ loop.operations = ops
+
if prepend_invariant:
loop.operations = vsd.invariant_oplist + ops
return loop
@@ -100,8 +110,7 @@
loop2 = self.schedule(loop1, [pack1])
loop3 = self.parse("""
v10[i32|4] = vec_raw_load(p0, i0, 4, descr=float)
- i14 = raw_load(p0, i4, descr=float)
- i15 = raw_load(p0, i5, descr=float)
+ v11[i32|2] = vec_raw_load(p0, i4, 2, descr=float)
""", False)
self.assert_equal(loop2, loop3)
@@ -109,12 +118,15 @@
loop1 = self.parse("""
i10 = raw_load(p0, i0, descr=long)
i11 = raw_load(p0, i1, descr=long)
- f10 = cast_int_to_float(i10)
- f11 = cast_int_to_float(i11)
+ i12 = int_signext(i10, 4)
+ i13 = int_signext(i11, 4)
+ f10 = cast_int_to_float(i12)
+ f11 = cast_int_to_float(i13)
""")
pack1 = self.pack(loop1, 0, 2)
pack2 = self.pack(loop1, 2, 4)
- loop2 = self.schedule(loop1, [pack1, pack2])
+ pack3 = self.pack(loop1, 4, 6)
+ loop2 = self.schedule(loop1, [pack1, pack2, pack3])
loop3 = self.parse("""
v10[i64|2] = vec_raw_load(p0, i0, 2, descr=long)
v20[i32|2] = vec_int_signext(v10[i64|2], 4)
@@ -321,3 +333,54 @@
guard_true(v11[i64|2]) []
""", False)
self.assert_equal(loop2, loop3)
+
+
+ def test_split_load_store(self):
+ loop1 = self.parse("""
+ i10 = raw_load(p0, i1, descr=float)
+ i11 = raw_load(p0, i2, descr=float)
+ raw_store(p0, i3, i10, descr=float)
+ raw_store(p0, i4, i11, descr=float)
+ """)
+ pack1 = self.pack(loop1, 0, 2)
+ pack2 = self.pack(loop1, 2, 4)
+ loop2 = self.schedule(loop1, [pack1,pack2], prepend_invariant=True)
+ loop3 = self.parse("""
+ v1[ui32|2] = vec_raw_load(p0, i1, 2, descr=float)
+ i10 = vec_int_unpack(v1[ui32|2], 0, 1)
+ raw_store(p0, i3, i10, descr=float)
+ i11 = vec_int_unpack(v1[ui32|2], 1, 1)
+ raw_store(p0, i4, i11, descr=float)
+ """, False)
+ # unfortunate ui32 is the type for float32... the unsigned u is for
+ # the tests
+ self.assert_equal(loop2, loop3)
+
+ def test_split_arith(self):
+ loop1 = self.parse("""
+ i10 = int_and(255, i1)
+ i11 = int_and(255, i1)
+ """)
+ pack1 = self.pack(loop1, 0, 2)
+ loop2 = self.schedule(loop1, [pack1], prepend_invariant=True)
+ loop3 = self.parse("""
+ v1[i64|2] = vec_int_expand(255)
+ v2[i64|2] = vec_int_expand(i1)
+ v3[i64|2] = vec_int_and(v1[i64|2], v2[i64|2])
+ """, False)
+ self.assert_equal(loop2, loop3)
+
+ def test_split_arith(self):
+ loop1 = self.parse("""
+ i10 = int_and(255, i1)
+ i11 = int_and(255, i1)
+ """)
+ pack1 = self.pack(loop1, 0, 2)
+ loop2 = self.schedule(loop1, [pack1], prepend_invariant=True)
+ loop3 = self.parse("""
+ v1[i64|2] = vec_int_expand(255)
+ v2[i64|2] = vec_int_expand(i1)
+ v3[i64|2] = vec_int_and(v1[i64|2], v2[i64|2])
+ """, False)
+ self.assert_equal(loop2, loop3)
+
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
@@ -1397,21 +1397,18 @@
jump(p0, p1, p5, p6, p7, p17, p19, i68, i39, i44, i49, i51)
"""
trace="""
- [p3, i4, p1, i5, i6, i7]
- guard_early_exit(descr=<ResumeAtLoopHeaderDescr object at 0x7f3afe4fb830>) [p1, i5, i4, p3]
- i8 = raw_load(i6, i5, descr=intarraydescr)
- guard_not_invalidated(descr=<ResumeGuardNotInvalidated object at 0x7f3afe4fb888>) [p1, i8, i5, i4, p3]
- i10 = int_and(i8, 255)
- guard_false(i10, descr=<ResumeGuardFalseDescr object at 0x7f3afe4fb8e0>) [p1, i5, i4, p3]
- i13 = getarrayitem_raw(139891327308826, 2, descr=chararraydescr)
- guard_value(i13, 1, descr=<ResumeGuardValueDescr object at 0x7f3afe4fb938>) [i13, p1, i5, i4, p3]
- i17 = getarrayitem_raw(139891327308824, 1, descr=chararraydescr)
- i19 = int_add(i4, 1)
- i21 = int_add(i5, 8)
- i22 = int_ge(i19, i7)
- guard_false(i22, descr=<ResumeGuardFalseDescr object at 0x7f3afe4fb990>) [i17, p1, i21, i19, None, None, p3]
- guard_value(i17, 2, descr=<ResumeGuardValueDescr object at 0x7f3afe4fb9e8>) [i17, p1, i21, i19, None, None, p3]
- jump(p3, i19, p1, i21, i6, i7)
+ [p0, p3, i4, i5, i6, i7]
+ guard_early_exit(descr=<rpython.jit.metainterp.compile.ResumeAtLoopHeaderDescr object at 0x7f492da84250>) [p0, p3, i4, i5]
+ f8 = raw_load(i6, i5, descr=floatarraydescr)
+ guard_not_invalidated(descr=<rpython.jit.metainterp.compile.ResumeGuardNotInvalidated object at 0x7f492da846d0>) [p0, f8, p3, i4, i5]
+ i9 = cast_float_to_int(f8)
+ i11 = int_and(i9, 255)
+ guard_true(i11, descr=<rpython.jit.metainterp.compile.ResumeGuardTrueDescr object at 0x7f492da8b790>) [p0, p3, i4, i5]
+ i13 = int_add(i4, 1)
+ i15 = int_add(i5, 8)
+ i16 = int_ge(i13, i7)
+ guard_false(i16, descr=<rpython.jit.metainterp.compile.ResumeGuardFalseDescr object at 0x7f492da93610>) [p0, i13, i15, p3, None, None]
+ jump(p0, p3, i13, i15, i6, i7)
"""
opt = self.vectorize(self.parse_loop(trace))
self.debug_print_operations(opt.loop)
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -339,6 +339,11 @@
for rdep in pack.right.depends():
lnode = ldep.to
rnode = rdep.to
+ # only valid if the result of the left is in args of pack left
+ result = lnode.getoperation().result
+ args = pack.left.getoperation().getarglist()
+ if result is None or result not in args:
+ continue
isomorph = isomorphic(lnode.getoperation(), rnode.getoperation())
if isomorph and lnode.is_before(rnode):
pair = self.packset.can_be_packed(lnode, rnode, pack, False)
@@ -351,6 +356,10 @@
for rdep in pack.right.provides():
lnode = ldep.to
rnode = rdep.to
+ result = pack.left.getoperation().result
+ args = lnode.getoperation().getarglist()
+ if result is None or result not in args:
+ continue
isomorph = isomorphic(lnode.getoperation(), rnode.getoperation())
if isomorph and lnode.is_before(rnode):
pair = self.packset.can_be_packed(lnode, rnode, pack, True)
diff --git a/rpython/jit/metainterp/warmstate.py b/rpython/jit/metainterp/warmstate.py
--- a/rpython/jit/metainterp/warmstate.py
+++ b/rpython/jit/metainterp/warmstate.py
@@ -379,7 +379,7 @@
# so that it will keep it alive for a longer time
warmrunnerdesc.memory_manager.keep_loop_alive(loop_token)
# XXX debug purpose only
- jitdriver_sd.xxxbench.xxx_clock_stop()
+ jitdriver_sd.xxxbench.xxx_clock_stop(fail=True)
# XXX debug purpose only end
#
# Handle the failure
More information about the pypy-commit
mailing list