[pypy-commit] pypy default: two optimizations of the jitting of unicode indexing/slicing:
cfbolz
pypy.commits at gmail.com
Sun Sep 15 08:03:06 EDT 2019
Author: Carl Friedrich Bolz-Tereick <cfbolz at gmx.de>
Branch:
Changeset: r97480:f6dcc827f8b0
Date: 2019-09-15 14:01 +0200
http://bitbucket.org/pypy/pypy/changeset/f6dcc827f8b0/
Log: two optimizations of the jitting of unicode indexing/slicing:
- don't make a bridge for all four sizes of codepoints each!
- fast paths in the jit for small constant indices to not create an
index structure
diff --git a/pypy/module/pypyjit/test_pypy_c/test_string.py b/pypy/module/pypyjit/test_pypy_c/test_string.py
--- a/pypy/module/pypyjit/test_pypy_c/test_string.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_string.py
@@ -31,9 +31,6 @@
i89 = strgetitem(p55, i83)
i24 = int_ge(i83, i12)
guard_false(i24, descr=...)
- i87 = strgetitem(p13, i83)
- i91 = int_le(i87, 127)
- guard_true(i91, descr=...)
i93 = int_add(i83, 1)
i94 = int_gt(i93, i56)
guard_false(i94, descr=...)
@@ -224,3 +221,72 @@
--TICK--
jump(..., descr=...)
""")
+
+ def test_unicode_indexing_makes_no_bridges(self):
+ log = self.run("""
+ u = u"aaaaaä👩👩👧👦" * 1000
+ def main():
+ for j in range(10):
+ for i in range(len(u)):
+ u[i] # ID: index0
+ """, [])
+ ops = log.loops[0].ops_by_id("index0")
+ for op in ops:
+ assert op.bridge is None
+
+ def test_unicode_indexing_small_constant_indices(self):
+ log = self.run("""
+ l = [u"abä", u"cdä", u"äü", u"éé", u"–—¿"] * 1000
+ def main(n):
+ global s
+ for u in l:
+ s = u[0] + u[1] + u[-1] # ID: index
+ len(u)
+ return len(s)
+ """, [1000])
+ loop, = log.loops_by_filename(self.filepath)
+ assert loop.match_by_id('index', '''
+ i77 = getfield_gc_i(p73, descr=<FieldS pypy.objspace.std.unicodeobject.W_UnicodeObject.inst__length .*>)
+ p78 = getfield_gc_r(p73, descr=<FieldP pypy.objspace.std.unicodeobject.W_UnicodeObject.inst__utf8 .* pure>)
+ i79 = strlen(p78)
+ i80 = int_eq(i77, i79)
+ guard_false(i80, descr=...) # check not ascii
+ i82 = int_ge(0, i77)
+ guard_false(i82, descr=...)
+ i85 = call_i(ConstClass(next_codepoint_pos_dont_look_inside), p78, 0, descr=...)
+ i86 = int_gt(i85, i79)
+ guard_false(i86, descr=...)
+ i88 = int_ge(1, i77)
+ guard_false(i88, descr=...)
+ i90 = call_i(ConstClass(next_codepoint_pos_dont_look_inside), p78, i85, descr=...)
+ i91 = int_gt(i90, i79)
+ guard_false(i91, descr=...)
+ i92 = int_sub(i90, i85)
+ i94 = int_add(-1, i77)
+ i96 = call_i(ConstClass(prev_codepoint_pos_dont_look_inside), p78, i79, descr=...)
+ i97 = int_sub(i79, i96)
+ guard_not_invalidated(descr=...)
+ ''')
+
+ def test_unicode_slicing_small_constant_indices(self):
+ log = self.run("""
+ def main(n):
+ u = u"abä👩👩👧👦éé–—¿" * 1000
+ global s
+ count = 0
+ while u:
+ u = u[1:] # ID: index
+ count += 1
+ return count
+ """, [1000])
+ loop, = log.loops_by_filename(self.filepath)
+ assert loop.match_by_id('index', '''
+ i51 = int_eq(1, i38)
+ guard_false(i51, descr=...)
+ i52 = strlen(p47)
+ i53 = int_eq(i38, i52)
+ guard_false(i53, descr=...)
+ i56 = call_i(ConstClass(next_codepoint_pos_dont_look_inside), p47, 0, descr=...)
+ i57 = int_sub(i52, i56)
+ i59 = int_sub(i38, 1)
+ ''')
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -187,6 +187,26 @@
space.newint(start + len1))
assert w_res is space.newbool(expected)
+ def test_getitem_constant_index_jit(self):
+ # test it directly, to prevent only seeing bugs in jitted code
+ space = self.space
+ u = u"äöabc"
+ w_u = self.space.wrap(u)
+ for i in range(-len(u), len(u)):
+ assert w_u._getitem_result_constant_index_jit(space, i)._utf8 == u[i].encode("utf-8")
+ with py.test.raises(OperationError):
+ w_u._getitem_result_constant_index_jit(space, len(u))
+ with py.test.raises(OperationError):
+ w_u._getitem_result_constant_index_jit(space, -len(u) - 1)
+
+ def test_getslice_constant_index_jit(self):
+ space = self.space
+ u = u"äöabcéééß"
+ w_u = self.space.wrap(u)
+ for start in range(0, 4):
+ for end in range(start, len(u)):
+ assert w_u._unicode_sliced_constant_index_jit(space, start, end)._utf8 == u[start: end].encode("utf-8")
+
class AppTestUnicodeStringStdOnly:
def test_compares(self):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -31,6 +31,16 @@
'encode_object', 'decode_object', 'unicode_from_object',
'unicode_from_string', 'unicode_to_decimal_w']
+MAX_UNROLL_NEXT_CODEPOINT_POS = 4
+
+@jit.elidable
+def next_codepoint_pos_dont_look_inside(utf8, p):
+ return rutf8.next_codepoint_pos(utf8, p)
+
+@jit.elidable
+def prev_codepoint_pos_dont_look_inside(utf8, p):
+ return rutf8.prev_codepoint_pos(utf8, p)
+
class W_UnicodeObject(W_Root):
import_from_mixin(StringMethods)
@@ -698,6 +708,9 @@
if sl == 0:
return self._empty()
elif step == 1:
+ if jit.we_are_jitted() and \
+ self._unroll_slice_heuristic(start, stop, w_index.w_stop):
+ return self._unicode_sliced_constant_index_jit(space, start, stop)
assert start >= 0 and stop >= 0
return self._unicode_sliced(space, start, stop)
else:
@@ -726,6 +739,9 @@
if start == stop:
return self._empty()
else:
+ if (jit.we_are_jitted() and
+ self._unroll_slice_heuristic(start, stop, w_stop)):
+ return self._unicode_sliced_constant_index_jit(space, start, stop)
return self._unicode_sliced(space, start, stop)
def _unicode_sliced(self, space, start, stop):
@@ -737,6 +753,31 @@
byte_stop = self._index_to_byte(stop)
return W_UnicodeObject(self._utf8[byte_start:byte_stop], stop - start)
+ @jit.unroll_safe
+ def _unicode_sliced_constant_index_jit(self, space, start, stop):
+ assert start >= 0
+ assert stop >= 0
+ byte_start = 0
+ for i in range(start):
+ byte_start = next_codepoint_pos_dont_look_inside(self._utf8, byte_start)
+ byte_stop = len(self._utf8)
+ for i in range(self._len() - stop):
+ byte_stop = prev_codepoint_pos_dont_look_inside(self._utf8, byte_stop)
+ return W_UnicodeObject(self._utf8[byte_start:byte_stop], stop - start)
+
+ def _unroll_slice_heuristic(self, start, stop, w_stop):
+ from pypy.objspace.std.intobject import W_IntObject
+ # the reason we use the *wrapped* stop is that for
+ # w_stop == wrapped -1, or w_None the stop that is computed will *not*
+ # be constant, because the length is often not constant.
+ return (not self.is_ascii() and
+ jit.isconstant(start) and
+ (jit.isconstant(w_stop) or
+ (isinstance(w_stop, W_IntObject) and
+ jit.isconstant(w_stop.intval))) and
+ start <= MAX_UNROLL_NEXT_CODEPOINT_POS and
+ self._len() - stop <= MAX_UNROLL_NEXT_CODEPOINT_POS)
+
def descr_capitalize(self, space):
value = self._utf8
if len(value) == 0:
@@ -863,12 +904,43 @@
return storage
def _getitem_result(self, space, index):
+ if (jit.we_are_jitted() and
+ not self.is_ascii() and
+ jit.isconstant(index) and
+ -MAX_UNROLL_NEXT_CODEPOINT_POS <= index <= MAX_UNROLL_NEXT_CODEPOINT_POS):
+ return self._getitem_result_constant_index_jit(space, index)
if index < 0:
index += self._length
if index < 0 or index >= self._length:
raise oefmt(space.w_IndexError, "string index out of range")
start = self._index_to_byte(index)
- end = rutf8.next_codepoint_pos(self._utf8, start)
+ # we must not inline next_codepoint_pos, otherwise we produce a guard!
+ end = self.next_codepoint_pos_dont_look_inside(start)
+ return W_UnicodeObject(self._utf8[start:end], 1)
+
+ @jit.unroll_safe
+ def _getitem_result_constant_index_jit(self, space, index):
+ # for small known indices, call next/prev_codepoint_pos a few times
+ # instead of possibly creating an index structure
+ if index < 0:
+ posindex = index + self._length
+ if posindex < 0:
+ raise oefmt(space.w_IndexError, "string index out of range")
+ end = len(self._utf8)
+ start = self.prev_codepoint_pos_dont_look_inside(end)
+ for i in range(-index-1):
+ end = start
+ start = self.prev_codepoint_pos_dont_look_inside(start)
+ else:
+ if index >= self._length:
+ raise oefmt(space.w_IndexError, "string index out of range")
+ start = 0
+ end = self.next_codepoint_pos_dont_look_inside(start)
+ for i in range(index):
+ start = end
+ end = self.next_codepoint_pos_dont_look_inside(end)
+ assert start >= 0
+ assert end >= 0
return W_UnicodeObject(self._utf8[start:end], 1)
def is_ascii(self):
@@ -895,6 +967,16 @@
return rutf8.codepoint_index_at_byte_position(
self._utf8, self._get_index_storage(), bytepos, self._len())
+ def next_codepoint_pos_dont_look_inside(self, pos):
+ if self.is_ascii():
+ return pos + 1
+ return next_codepoint_pos_dont_look_inside(self._utf8, pos)
+
+ def prev_codepoint_pos_dont_look_inside(self, pos):
+ if self.is_ascii():
+ return pos - 1
+ return prev_codepoint_pos_dont_look_inside(self._utf8, pos)
+
@always_inline
def _unwrap_and_search(self, space, w_sub, w_start, w_end, forward=True):
w_sub = self.convert_arg_to_w_unicode(space, w_sub)
More information about the pypy-commit
mailing list