[pypy-commit] pypy unicode-utf8: Add a utility function that I might use in rsre

arigo pypy.commits at gmail.com
Mon Nov 27 17:11:12 EST 2017


Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r93187:7839b53125bb
Date: 2017-11-27 23:09 +0100
http://bitbucket.org/pypy/pypy/changeset/7839b53125bb/

Log:	Add a utility function that I might use in rsre

diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -567,6 +567,31 @@
         bytepos = next_codepoint_pos(utf8, bytepos)
     return codepoint_at_pos(utf8, bytepos)
 
+@jit.dont_look_inside
+def codepoint_index_at_byte_position(utf8, storage, bytepos):
+    """ Return the character index for which
+    codepoint_position_at_index(index) == bytepos.
+    This is a relatively slow operation in that it runs in a time
+    logarithmic in the length of the string, plus some constant that
+    is not tiny either.
+    """
+    index_min = 0
+    index_max = len(storage.contents) - 1
+    while index_min < index_max:
+        index_middle = (index_min + index_max + 1) // 2
+        base_bytepos = storage.contents[index_middle].baseindex
+        if bytepos < base_bytepos:
+            index_max = index_middle - 1
+        else:
+            index_min = index_middle
+    bytepos1 = storage.contents[index_min].baseindex
+    result = index_min << 6
+    while bytepos1 < bytepos:
+        bytepos1 = next_codepoint_pos(utf8, bytepos1)
+        result += 1
+    return result
+
+
 def make_utf8_escape_function(pass_printable=False, quotes=False, prefix=None):
     @jit.elidable
     def unicode_escape(s):
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -128,6 +128,17 @@
         assert (rutf8.codepoint_position_at_index(u.encode('utf8'), index, i) ==
                 len(u[:i].encode('utf8')))
 
+@given(strategies.text(average_size=140))
+@example(u'x' * 64 * 5)
+@example(u'x' * (64 * 5 - 1))
+def test_codepoint_index_at_byte_position(u):
+    storage = rutf8.create_utf8_index_storage(u.encode('utf8'), len(u))
+    for i in range(len(u) + 1):
+        bytepos = len(u[:i].encode('utf8'))
+        assert rutf8.codepoint_index_at_byte_position(
+                       u.encode('utf8'), storage, bytepos) == i
+
+
 repr_func = rutf8.make_utf8_escape_function(prefix='u', pass_printable=False,
                                             quotes=True)
 


More information about the pypy-commit mailing list