[pypy-commit] pypy unicode-utf8: Add a utility function that I might use in rsre
arigo
pypy.commits at gmail.com
Mon Nov 27 17:11:12 EST 2017
Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r93187:7839b53125bb
Date: 2017-11-27 23:09 +0100
http://bitbucket.org/pypy/pypy/changeset/7839b53125bb/
Log: Add a utility function that I might use in rsre
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -567,6 +567,31 @@
bytepos = next_codepoint_pos(utf8, bytepos)
return codepoint_at_pos(utf8, bytepos)
+@jit.dont_look_inside
+def codepoint_index_at_byte_position(utf8, storage, bytepos):
+ """ Return the character index for which
+ codepoint_position_at_index(index) == bytepos.
+ This is a relatively slow operation in that it runs in a time
+ logarithmic in the length of the string, plus some constant that
+ is not tiny either.
+ """
+ index_min = 0
+ index_max = len(storage.contents) - 1
+ while index_min < index_max:
+ index_middle = (index_min + index_max + 1) // 2
+ base_bytepos = storage.contents[index_middle].baseindex
+ if bytepos < base_bytepos:
+ index_max = index_middle - 1
+ else:
+ index_min = index_middle
+ bytepos1 = storage.contents[index_min].baseindex
+ result = index_min << 6
+ while bytepos1 < bytepos:
+ bytepos1 = next_codepoint_pos(utf8, bytepos1)
+ result += 1
+ return result
+
+
def make_utf8_escape_function(pass_printable=False, quotes=False, prefix=None):
@jit.elidable
def unicode_escape(s):
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -128,6 +128,17 @@
assert (rutf8.codepoint_position_at_index(u.encode('utf8'), index, i) ==
len(u[:i].encode('utf8')))
+@given(strategies.text(average_size=140))
+@example(u'x' * 64 * 5)
+@example(u'x' * (64 * 5 - 1))
+def test_codepoint_index_at_byte_position(u):
+ storage = rutf8.create_utf8_index_storage(u.encode('utf8'), len(u))
+ for i in range(len(u) + 1):
+ bytepos = len(u[:i].encode('utf8'))
+ assert rutf8.codepoint_index_at_byte_position(
+ u.encode('utf8'), storage, bytepos) == i
+
+
repr_func = rutf8.make_utf8_escape_function(prefix='u', pass_printable=False,
quotes=True)
More information about the pypy-commit
mailing list