[pypy-commit] pypy py3.6: merge default

Mon Sep 16 04:30:34 EDT 2019

Author: Carl Friedrich Bolz-Tereick <cfbolz at gmx.de>
Branch: py3.6
Changeset: r97486:a94f909131d8
Date: 2019-09-16 10:29 +0200
http://bitbucket.org/pypy/pypy/changeset/a94f909131d8/

Log:	merge default

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -31,7 +31,7 @@
     # Fast version of the "strict" errors handler.
     def raise_unicode_exception_encode(errors, encoding, msg, utf8,
                                        startingpos, endingpos):
-        u_len = rutf8.get_utf8_length(utf8)
+        u_len = rutf8.codepoints_in_utf8(utf8)
         raise OperationError(space.w_UnicodeEncodeError,
                              space.newtuple([space.newtext(encoding),
                                              space.newutf8(utf8, u_len),
diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -1,4 +1,4 @@
-from rpython.rlib.rutf8 import get_utf8_length, next_codepoint_pos
+from rpython.rlib.rutf8 import codepoints_in_utf8, next_codepoint_pos
 
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.typedef import (
@@ -98,7 +98,7 @@
         return result
 
     def write(self, string):
-        length = get_utf8_length(string)
+        length = codepoints_in_utf8(string)
         if self.pos + length > len(self.data):
             self.resize(self.pos + length)
         pos = 0
@@ -173,7 +173,7 @@
         if readnl is None:
             w_readnl = space.w_None
         else:
-            w_readnl = space.str(space.newutf8(readnl, get_utf8_length(readnl)))  # YYY
+            w_readnl = space.str(space.newutf8(readnl, codepoints_in_utf8(readnl)))  # YYY
         return space.newtuple([
             w_initialval, w_readnl, space.newint(self.buf.pos), w_dict
         ])
@@ -239,7 +239,7 @@
             w_decoded = space.call_method(
                 w_decoded, "replace",
                 space.newtext("\n"),
-                space.newutf8(writenl, get_utf8_length(writenl)),
+                space.newutf8(writenl, codepoints_in_utf8(writenl)),
             )
         string = space.utf8_w(w_decoded)
         if string:
@@ -251,7 +251,7 @@
         self._check_closed(space)
         size = convert_size(space, w_size)
         v = self.buf.read(size)
-        lgt = get_utf8_length(v)
+        lgt = codepoints_in_utf8(v)
         return space.newutf8(v, lgt)
 
     def readline_w(self, space, w_limit=None):
@@ -266,7 +266,7 @@
             else:
                 newline = self.readnl
             result = self.buf.readline(newline, limit)
-        resultlen = get_utf8_length(result)
+        resultlen = codepoints_in_utf8(result)
         return space.newutf8(result, resultlen)
 
 
@@ -305,7 +305,7 @@
     def getvalue_w(self, space):
         self._check_closed(space)
         v = self.buf.getvalue()
-        lgt = get_utf8_length(v)
+        lgt = codepoints_in_utf8(v)
         return space.newutf8(v, lgt)
 
     def readable_w(self, space):
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -12,7 +12,7 @@
 from rpython.rlib.rbigint import rbigint
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.rutf8 import (check_utf8, next_codepoint_pos,
-                                codepoints_in_utf8, get_utf8_length,
+                                codepoints_in_utf8, codepoints_in_utf8,
                                 Utf8StringBuilder)
 
 
@@ -905,7 +905,7 @@
                 haslf = True
         if haslf and self.writetranslate and self.writenl:
             w_text = space.call_method(w_text, "replace", space.newutf8('\n', 1),
-                               space.newutf8(self.writenl, get_utf8_length(self.writenl)))
+                               space.newutf8(self.writenl, codepoints_in_utf8(self.writenl)))
             text = space.utf8_w(w_text)
 
         needflush = False
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -157,7 +157,7 @@
         replace, end, rettype = errorcb(errors, namecb, reason,
                                stringdata, start, end)
         # 'replace' is UTF8 encoded unicode, rettype is 'u'
-    lgt = rutf8.get_utf8_length(replace)
+    lgt = rutf8.codepoints_in_utf8(replace)
     inbuf = rffi.utf82wcharp(replace, lgt)
     try:
         r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, lgt, end)
diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py
--- a/pypy/module/_multibytecodec/interp_incremental.py
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -67,7 +67,7 @@
         pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
         assert 0 <= pos <= len(object)
         self.pending = object[pos:]
-        lgt = rutf8.get_utf8_length(output)
+        lgt = rutf8.codepoints_in_utf8(output)
         return space.newutf8(output, lgt)
 
 
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -27,7 +27,7 @@
             raise wrap_unicodedecodeerror(space, e, input, self.name)
         except RuntimeError:
             raise wrap_runtimeerror(space)
-        lgt = rutf8.get_utf8_length(utf8_output)
+        lgt = rutf8.codepoints_in_utf8(utf8_output)
         return space.newtuple([space.newutf8(utf8_output, lgt),
                                space.newint(len(input))])
 
diff --git a/pypy/module/_multibytecodec/test/test_translation.py b/pypy/module/_multibytecodec/test/test_translation.py
--- a/pypy/module/_multibytecodec/test/test_translation.py
+++ b/pypy/module/_multibytecodec/test/test_translation.py
@@ -14,7 +14,7 @@
             codecname, string = argv[1], argv[2]
             c = c_codecs.getcodec(codecname)
             u = c_codecs.decode(c, string)
-            lgt = rutf8.get_utf8_length(u)
+            lgt = rutf8.codepoints_in_utf8(u)
             r = c_codecs.encode(c, u, lgt)
             print r
             return 0
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -49,7 +49,7 @@
             return space.newbytes(ctx._string[start:end])
         elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
             s = ctx._utf8[start:end]
-            lgt = rutf8.get_utf8_length(s)
+            lgt = rutf8.codepoints_in_utf8(s)
             return space.newutf8(s, lgt)
         else:
             # unreachable
@@ -496,7 +496,7 @@
             elif use_builder == 'U':
                 assert isinstance(ctx, rsre_utf8.Utf8MatchContext)
                 return space.newutf8(result_bytes,
-                                     rutf8.get_utf8_length(result_bytes)), n
+                                     rutf8.codepoints_in_utf8(result_bytes)), n
             else:
                 raise AssertionError(use_builder)
         else:
@@ -788,7 +788,7 @@
         elif isinstance(ctx, rsre_core.StrMatchContext):
             return space.newbytes(ctx._string)
         elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
-            lgt = rutf8.get_utf8_length(ctx._utf8)
+            lgt = rutf8.codepoints_in_utf8(ctx._utf8)
             return space.newutf8(ctx._utf8, lgt)
         else:
             raise SystemError
diff --git a/pypy/module/micronumpy/boxes.py b/pypy/module/micronumpy/boxes.py
--- a/pypy/module/micronumpy/boxes.py
+++ b/pypy/module/micronumpy/boxes.py
@@ -11,7 +11,7 @@
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib import jit
-from rpython.rlib.rutf8 import get_utf8_length
+from rpython.rlib.rutf8 import codepoints_in_utf8
 from rpython.rtyper.lltypesystem import lltype, rffi
 from rpython.tool.sourcetools import func_with_new_name
 from pypy.module.micronumpy import constants as NPY
@@ -629,7 +629,7 @@
             return self
         elif dtype.is_object():
             return W_ObjectBox(space.newutf8(self._value,
-                               get_utf8_length(self._value)))
+                               codepoints_in_utf8(self._value)))
         else:
             raise oefmt(space.w_NotImplementedError,
                         "Conversion from unicode not implemented yet")
diff --git a/pypy/module/micronumpy/types.py b/pypy/module/micronumpy/types.py
--- a/pypy/module/micronumpy/types.py
+++ b/pypy/module/micronumpy/types.py
@@ -1,7 +1,7 @@
 import functools
 import math
 from rpython.rlib.unroll import unrolling_iterable
-from rpython.rlib.rutf8 import Utf8StringIterator, get_utf8_length, Utf8StringBuilder
+from rpython.rlib.rutf8 import Utf8StringIterator, codepoints_in_utf8, Utf8StringBuilder
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.objspace.std.floatobject import float2string
 from pypy.objspace.std.complexobject import str_format
@@ -2330,7 +2330,7 @@
 
     def to_builtin_type(self, space, box):
         assert isinstance(box, boxes.W_UnicodeBox)
-        return space.newutf8(box._value, get_utf8_length(box._value))
+        return space.newutf8(box._value, codepoints_in_utf8(box._value))
 
     def eq(self, v1, v2):
         assert isinstance(v1, boxes.W_UnicodeBox)
diff --git a/pypy/module/unicodedata/test/test_hyp.py b/pypy/module/unicodedata/test/test_hyp.py
--- a/pypy/module/unicodedata/test/test_hyp.py
+++ b/pypy/module/unicodedata/test/test_hyp.py
@@ -6,12 +6,12 @@
     pytest.skip("hypothesis required")
 
 from pypy.module.unicodedata.interp_ucd import ucd
-from rpython.rlib.rutf8 import get_utf8_length
+from rpython.rlib.rutf8 import codepoints_in_utf8
 
 def make_normalization(space, NF_code):
     def normalize(s):
         u = s.encode('utf8')
-        w_s = space.newutf8(u, get_utf8_length(u))
+        w_s = space.newutf8(u, codepoints_in_utf8(u))
         w_res = ucd.normalize(space, NF_code, w_s)
         return space.utf8_w(w_res).decode('utf8')
     return normalize
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -363,25 +363,12 @@
     raise CheckError(~res)
 
 def get_utf8_length(s, start=0, end=-1):
+    # DEPRECATED! use codepoints_in_utf8 instead
     """ Get the length out of valid utf8.
     """
     if end < 0:
         end = len(s)
-    res = 0
-    pos = start
-    while pos < end:
-        ordch1 = ord(s[pos])
-        res += 1
-        if ordch1 <= 0x7F:
-            pos += 1
-        elif ordch1 <= 0xDF:
-            pos += 2
-        elif ordch1 <= 0xEF:
-            pos += 3
-        elif ordch1 <= 0xF4:
-            pos += 4
-
-    return res
+    return codepoints_in_utf8(s, start, end)
 
 @jit.elidable
 def _check_utf8(s, allow_surrogates, start, stop):
@@ -761,13 +748,13 @@
     def append(self, s):
         # for strings
         self._s.append(s)
-        newlgt = get_utf8_length(s)
+        newlgt = codepoints_in_utf8(s)
         self._lgt += newlgt
 
     @always_inline
     def append_slice(self, s, start, end):
         self._s.append_slice(s, start, end)
-        newlgt = get_utf8_length(s, start, end)
+        newlgt = codepoints_in_utf8(s, start, end)
         self._lgt += newlgt
 
     @signature(types.self(), char(), returns=none())
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -169,14 +169,6 @@
     expected = any(uch for uch in unichars if u'\ud800' <= uch <= u'\udfff')
     assert result == expected
 
-@given(strategies.lists(strategies.characters()))
-def test_get_utf8_length(unichars):
-    u = u''.join(unichars)
-    exp_lgt = len(u)
-    s = ''.join([c.encode('utf8') for c in u])
-    lgt = rutf8.get_utf8_length(s)
-    if not _has_surrogates(s) or sys.maxunicode > 0xffff:
-        assert lgt == exp_lgt
 
 def test_utf8_string_builder():
     s = rutf8.Utf8StringBuilder()