[pypy-commit] pypy py3.6: faster implementation of get_utf8_length that makes use of the fact that the string in question must already be valid utf-8
cfbolz
pypy.commits at gmail.com
Sun Sep 15 16:55:58 EDT 2019
Author: Carl Friedrich Bolz-Tereick <cfbolz at gmx.de>
Branch: py3.6
Changeset: r97484:17c06a8891ad
Date: 2019-09-15 22:55 +0200
http://bitbucket.org/pypy/pypy/changeset/17c06a8891ad/
Log: faster implementation of get_utf8_length that makes use of the fact
that the string in question must already be valid utf-8
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -363,9 +363,25 @@
raise CheckError(~res)
def get_utf8_length(s, start=0, end=-1):
- """ Get the length out of valid utf8. For now just calls check_utf8
+ """ Get the length out of valid utf8.
"""
- return check_utf8(s, True, start, end)
+ if end < 0:
+ end = len(s)
+ res = 0
+ pos = start
+ while pos < end:
+ ordch1 = ord(s[pos])
+ res += 1
+ if ordch1 <= 0x7F:
+ pos += 1
+ elif ordch1 <= 0xDF:
+ pos += 2
+ elif ordch1 <= 0xEF:
+ pos += 3
+ elif ordch1 <= 0xF4:
+ pos += 4
+
+ return res
@jit.elidable
def _check_utf8(s, allow_surrogates, start, stop):
More information about the pypy-commit mailing list