[pypy-commit] pypy default: Utf8StringPosIterator to iterate over character, position pairs
cfbolz
pypy.commits at gmail.com
Tue Feb 26 10:02:59 EST 2019
Author: Carl Friedrich Bolz-Tereick <cfbolz at gmx.de>
Branch:
Changeset: r96168:5512e76a8a54
Date: 2019-02-26 15:57 +0100
http://bitbucket.org/pypy/pypy/changeset/5512e76a8a54/
Log: Utf8StringPosIterator to iterate over character, position pairs
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -153,6 +153,7 @@
"""Gives the position of the previous codepoint.
'pos' must not be zero.
"""
+ assert pos != 0
pos -= 1
if pos >= len(code): # for the case where pos - 1 == len(code):
assert pos >= 0
@@ -811,6 +812,18 @@
(0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 )
assert False, "unreachable"
+class Utf8StringPosIterator(object):
+ def __init__(self, utf8s):
+ self.it = Utf8StringIterator(utf8s)
+
+ def __iter__(self):
+ return self
+
+ @always_inline
+ def next(self):
+ pos = self.it.get_pos()
+ return (self.it.next(), pos)
+
def decode_latin_1(s):
if len(s) == 0:
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -212,3 +212,16 @@
for c in u:
l.append(unichr(c))
assert list(arg) == l
+
+@given(strategies.text())
+def test_utf8_iterator_pos(arg):
+ utf8s = arg.encode('utf8')
+ u = rutf8.Utf8StringPosIterator(utf8s)
+ l = []
+ i = 0
+ for c, pos in u:
+ l.append(unichr(c))
+ assert c == rutf8.codepoint_at_pos(utf8s, pos)
+ assert pos == i
+ i = rutf8.next_codepoint_pos(utf8s, i)
+ assert list(arg) == l
More information about the pypy-commit
mailing list