[pypy-commit] pypy py3.6: merge default
cfbolz
pypy.commits at gmail.com
Mon Sep 16 04:30:34 EDT 2019
Author: Carl Friedrich Bolz-Tereick <cfbolz at gmx.de>
Branch: py3.6
Changeset: r97486:a94f909131d8
Date: 2019-09-16 10:29 +0200
http://bitbucket.org/pypy/pypy/changeset/a94f909131d8/
Log: merge default
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -31,7 +31,7 @@
# Fast version of the "strict" errors handler.
def raise_unicode_exception_encode(errors, encoding, msg, utf8,
startingpos, endingpos):
- u_len = rutf8.get_utf8_length(utf8)
+ u_len = rutf8.codepoints_in_utf8(utf8)
raise OperationError(space.w_UnicodeEncodeError,
space.newtuple([space.newtext(encoding),
space.newutf8(utf8, u_len),
diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -1,4 +1,4 @@
-from rpython.rlib.rutf8 import get_utf8_length, next_codepoint_pos
+from rpython.rlib.rutf8 import codepoints_in_utf8, next_codepoint_pos
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.typedef import (
@@ -98,7 +98,7 @@
return result
def write(self, string):
- length = get_utf8_length(string)
+ length = codepoints_in_utf8(string)
if self.pos + length > len(self.data):
self.resize(self.pos + length)
pos = 0
@@ -173,7 +173,7 @@
if readnl is None:
w_readnl = space.w_None
else:
- w_readnl = space.str(space.newutf8(readnl, get_utf8_length(readnl))) # YYY
+ w_readnl = space.str(space.newutf8(readnl, codepoints_in_utf8(readnl))) # YYY
return space.newtuple([
w_initialval, w_readnl, space.newint(self.buf.pos), w_dict
])
@@ -239,7 +239,7 @@
w_decoded = space.call_method(
w_decoded, "replace",
space.newtext("\n"),
- space.newutf8(writenl, get_utf8_length(writenl)),
+ space.newutf8(writenl, codepoints_in_utf8(writenl)),
)
string = space.utf8_w(w_decoded)
if string:
@@ -251,7 +251,7 @@
self._check_closed(space)
size = convert_size(space, w_size)
v = self.buf.read(size)
- lgt = get_utf8_length(v)
+ lgt = codepoints_in_utf8(v)
return space.newutf8(v, lgt)
def readline_w(self, space, w_limit=None):
@@ -266,7 +266,7 @@
else:
newline = self.readnl
result = self.buf.readline(newline, limit)
- resultlen = get_utf8_length(result)
+ resultlen = codepoints_in_utf8(result)
return space.newutf8(result, resultlen)
@@ -305,7 +305,7 @@
def getvalue_w(self, space):
self._check_closed(space)
v = self.buf.getvalue()
- lgt = get_utf8_length(v)
+ lgt = codepoints_in_utf8(v)
return space.newutf8(v, lgt)
def readable_w(self, space):
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -12,7 +12,7 @@
from rpython.rlib.rbigint import rbigint
from rpython.rlib.rstring import StringBuilder
from rpython.rlib.rutf8 import (check_utf8, next_codepoint_pos,
- codepoints_in_utf8, get_utf8_length,
+ codepoints_in_utf8, codepoints_in_utf8,
Utf8StringBuilder)
@@ -905,7 +905,7 @@
haslf = True
if haslf and self.writetranslate and self.writenl:
w_text = space.call_method(w_text, "replace", space.newutf8('\n', 1),
- space.newutf8(self.writenl, get_utf8_length(self.writenl)))
+ space.newutf8(self.writenl, codepoints_in_utf8(self.writenl)))
text = space.utf8_w(w_text)
needflush = False
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -157,7 +157,7 @@
replace, end, rettype = errorcb(errors, namecb, reason,
stringdata, start, end)
# 'replace' is UTF8 encoded unicode, rettype is 'u'
- lgt = rutf8.get_utf8_length(replace)
+ lgt = rutf8.codepoints_in_utf8(replace)
inbuf = rffi.utf82wcharp(replace, lgt)
try:
r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, lgt, end)
diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py
--- a/pypy/module/_multibytecodec/interp_incremental.py
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -67,7 +67,7 @@
pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
assert 0 <= pos <= len(object)
self.pending = object[pos:]
- lgt = rutf8.get_utf8_length(output)
+ lgt = rutf8.codepoints_in_utf8(output)
return space.newutf8(output, lgt)
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -27,7 +27,7 @@
raise wrap_unicodedecodeerror(space, e, input, self.name)
except RuntimeError:
raise wrap_runtimeerror(space)
- lgt = rutf8.get_utf8_length(utf8_output)
+ lgt = rutf8.codepoints_in_utf8(utf8_output)
return space.newtuple([space.newutf8(utf8_output, lgt),
space.newint(len(input))])
diff --git a/pypy/module/_multibytecodec/test/test_translation.py b/pypy/module/_multibytecodec/test/test_translation.py
--- a/pypy/module/_multibytecodec/test/test_translation.py
+++ b/pypy/module/_multibytecodec/test/test_translation.py
@@ -14,7 +14,7 @@
codecname, string = argv[1], argv[2]
c = c_codecs.getcodec(codecname)
u = c_codecs.decode(c, string)
- lgt = rutf8.get_utf8_length(u)
+ lgt = rutf8.codepoints_in_utf8(u)
r = c_codecs.encode(c, u, lgt)
print r
return 0
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -49,7 +49,7 @@
return space.newbytes(ctx._string[start:end])
elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
s = ctx._utf8[start:end]
- lgt = rutf8.get_utf8_length(s)
+ lgt = rutf8.codepoints_in_utf8(s)
return space.newutf8(s, lgt)
else:
# unreachable
@@ -496,7 +496,7 @@
elif use_builder == 'U':
assert isinstance(ctx, rsre_utf8.Utf8MatchContext)
return space.newutf8(result_bytes,
- rutf8.get_utf8_length(result_bytes)), n
+ rutf8.codepoints_in_utf8(result_bytes)), n
else:
raise AssertionError(use_builder)
else:
@@ -788,7 +788,7 @@
elif isinstance(ctx, rsre_core.StrMatchContext):
return space.newbytes(ctx._string)
elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
- lgt = rutf8.get_utf8_length(ctx._utf8)
+ lgt = rutf8.codepoints_in_utf8(ctx._utf8)
return space.newutf8(ctx._utf8, lgt)
else:
raise SystemError
diff --git a/pypy/module/micronumpy/boxes.py b/pypy/module/micronumpy/boxes.py
--- a/pypy/module/micronumpy/boxes.py
+++ b/pypy/module/micronumpy/boxes.py
@@ -11,7 +11,7 @@
from rpython.rlib.rstring import StringBuilder
from rpython.rlib.objectmodel import specialize
from rpython.rlib import jit
-from rpython.rlib.rutf8 import get_utf8_length
+from rpython.rlib.rutf8 import codepoints_in_utf8
from rpython.rtyper.lltypesystem import lltype, rffi
from rpython.tool.sourcetools import func_with_new_name
from pypy.module.micronumpy import constants as NPY
@@ -629,7 +629,7 @@
return self
elif dtype.is_object():
return W_ObjectBox(space.newutf8(self._value,
- get_utf8_length(self._value)))
+ codepoints_in_utf8(self._value)))
else:
raise oefmt(space.w_NotImplementedError,
"Conversion from unicode not implemented yet")
diff --git a/pypy/module/micronumpy/types.py b/pypy/module/micronumpy/types.py
--- a/pypy/module/micronumpy/types.py
+++ b/pypy/module/micronumpy/types.py
@@ -1,7 +1,7 @@
import functools
import math
from rpython.rlib.unroll import unrolling_iterable
-from rpython.rlib.rutf8 import Utf8StringIterator, get_utf8_length, Utf8StringBuilder
+from rpython.rlib.rutf8 import Utf8StringIterator, codepoints_in_utf8, Utf8StringBuilder
from pypy.interpreter.error import OperationError, oefmt
from pypy.objspace.std.floatobject import float2string
from pypy.objspace.std.complexobject import str_format
@@ -2330,7 +2330,7 @@
def to_builtin_type(self, space, box):
assert isinstance(box, boxes.W_UnicodeBox)
- return space.newutf8(box._value, get_utf8_length(box._value))
+ return space.newutf8(box._value, codepoints_in_utf8(box._value))
def eq(self, v1, v2):
assert isinstance(v1, boxes.W_UnicodeBox)
diff --git a/pypy/module/unicodedata/test/test_hyp.py b/pypy/module/unicodedata/test/test_hyp.py
--- a/pypy/module/unicodedata/test/test_hyp.py
+++ b/pypy/module/unicodedata/test/test_hyp.py
@@ -6,12 +6,12 @@
pytest.skip("hypothesis required")
from pypy.module.unicodedata.interp_ucd import ucd
-from rpython.rlib.rutf8 import get_utf8_length
+from rpython.rlib.rutf8 import codepoints_in_utf8
def make_normalization(space, NF_code):
def normalize(s):
u = s.encode('utf8')
- w_s = space.newutf8(u, get_utf8_length(u))
+ w_s = space.newutf8(u, codepoints_in_utf8(u))
w_res = ucd.normalize(space, NF_code, w_s)
return space.utf8_w(w_res).decode('utf8')
return normalize
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -363,25 +363,12 @@
raise CheckError(~res)
def get_utf8_length(s, start=0, end=-1):
+ # DEPRECATED! use codepoints_in_utf8 instead
""" Get the length out of valid utf8.
"""
if end < 0:
end = len(s)
- res = 0
- pos = start
- while pos < end:
- ordch1 = ord(s[pos])
- res += 1
- if ordch1 <= 0x7F:
- pos += 1
- elif ordch1 <= 0xDF:
- pos += 2
- elif ordch1 <= 0xEF:
- pos += 3
- elif ordch1 <= 0xF4:
- pos += 4
-
- return res
+ return codepoints_in_utf8(s, start, end)
@jit.elidable
def _check_utf8(s, allow_surrogates, start, stop):
@@ -761,13 +748,13 @@
def append(self, s):
# for strings
self._s.append(s)
- newlgt = get_utf8_length(s)
+ newlgt = codepoints_in_utf8(s)
self._lgt += newlgt
@always_inline
def append_slice(self, s, start, end):
self._s.append_slice(s, start, end)
- newlgt = get_utf8_length(s, start, end)
+ newlgt = codepoints_in_utf8(s, start, end)
self._lgt += newlgt
@signature(types.self(), char(), returns=none())
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -169,14 +169,6 @@
expected = any(uch for uch in unichars if u'\ud800' <= uch <= u'\udfff')
assert result == expected
-@given(strategies.lists(strategies.characters()))
-def test_get_utf8_length(unichars):
- u = u''.join(unichars)
- exp_lgt = len(u)
- s = ''.join([c.encode('utf8') for c in u])
- lgt = rutf8.get_utf8_length(s)
- if not _has_surrogates(s) or sys.maxunicode > 0xffff:
- assert lgt == exp_lgt
def test_utf8_string_builder():
s = rutf8.Utf8StringBuilder()
More information about the pypy-commit
mailing list