[pypy-commit] pypy py3.6: faster implementation of get_utf8_length that makes use of the fact that the string in question must already be valid utf-8

Sun Sep 15 16:55:58 EDT 2019

Author: Carl Friedrich Bolz-Tereick <cfbolz at gmx.de>
Branch: py3.6
Changeset: r97484:17c06a8891ad
Date: 2019-09-15 22:55 +0200
http://bitbucket.org/pypy/pypy/changeset/17c06a8891ad/

Log:	faster implementation of get_utf8_length that makes use of the fact
	that the string in question must already be valid utf-8

diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -363,9 +363,25 @@
     raise CheckError(~res)
 
 def get_utf8_length(s, start=0, end=-1):
-    """ Get the length out of valid utf8. For now just calls check_utf8
+    """ Get the length out of valid utf8.
     """
-    return check_utf8(s, True, start, end)
+    if end < 0:
+        end = len(s)
+    res = 0
+    pos = start
+    while pos < end:
+        ordch1 = ord(s[pos])
+        res += 1
+        if ordch1 <= 0x7F:
+            pos += 1
+        elif ordch1 <= 0xDF:
+            pos += 2
+        elif ordch1 <= 0xEF:
+            pos += 3
+        elif ordch1 <= 0xF4:
+            pos += 4
+
+    return res
 
 @jit.elidable
 def _check_utf8(s, allow_surrogates, start, stop):