[pypy-commit] pypy default: Utf8StringPosIterator to iterate over character, position pairs

cfbolz pypy.commits at gmail.com
Tue Feb 26 10:02:59 EST 2019


Author: Carl Friedrich Bolz-Tereick <cfbolz at gmx.de>
Branch: 
Changeset: r96168:5512e76a8a54
Date: 2019-02-26 15:57 +0100
http://bitbucket.org/pypy/pypy/changeset/5512e76a8a54/

Log:	Utf8StringPosIterator to iterate over character, position pairs

diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -153,6 +153,7 @@
     """Gives the position of the previous codepoint.
     'pos' must not be zero.
     """
+    assert pos != 0
     pos -= 1
     if pos >= len(code):     # for the case where pos - 1 == len(code):
         assert pos >= 0
@@ -811,6 +812,18 @@
                    (0xF0   << 18) + (0x80   << 12) + (0x80   << 6) + 0x80     )
         assert False, "unreachable"
 
+class Utf8StringPosIterator(object):
+    def __init__(self, utf8s):
+        self.it = Utf8StringIterator(utf8s)
+
+    def __iter__(self):
+        return self
+
+    @always_inline
+    def next(self):
+        pos = self.it.get_pos()
+        return (self.it.next(), pos)
+
 
 def decode_latin_1(s):
     if len(s) == 0:
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -212,3 +212,16 @@
     for c in u:
         l.append(unichr(c))
     assert list(arg) == l
+
+@given(strategies.text())
+def test_utf8_iterator_pos(arg):
+    utf8s = arg.encode('utf8')
+    u = rutf8.Utf8StringPosIterator(utf8s)
+    l = []
+    i = 0
+    for c, pos in u:
+        l.append(unichr(c))
+        assert c == rutf8.codepoint_at_pos(utf8s, pos)
+        assert pos == i
+        i = rutf8.next_codepoint_pos(utf8s, i)
+    assert list(arg) == l


More information about the pypy-commit mailing list