[pypy-commit] pypy unicode-utf8-py3: remove more bytes.decode, assert on unused case in module.struct
mattip
pypy.commits at gmail.com
Sat Feb 2 12:37:20 EST 2019
Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95777:f165c244dfb4
Date: 2019-02-02 19:06 +0200
http://bitbucket.org/pypy/pypy/changeset/f165c244dfb4/
Log: remove more bytes.decode, assert on unused case in module.struct
diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -11,6 +11,8 @@
- rutf8.utf8_encode_mbcs
- unicodehelper.fsencode
- unicodehelper.unicode_to_decimal_w
+ - _winreg.interp_winreg
+* remove 'assert not isinstance(*, unicode)'
* remove asserts from _WIN32 paths in rlib.rposix.re{name,place}
* convert all realunicode_w to unicode_w after we flush out all old uses of
unicode_w
diff --git a/pypy/interpreter/pyparser/pytokenizer.py b/pypy/interpreter/pyparser/pytokenizer.py
--- a/pypy/interpreter/pyparser/pytokenizer.py
+++ b/pypy/interpreter/pyparser/pytokenizer.py
@@ -65,14 +65,9 @@
def verify_identifier(token):
# 1=ok; 0=not an identifier; -1=bad utf-8
- for c in token:
- if ord(c) >= 0x80:
- break
- else:
- return 1
try:
- u = token.decode('utf-8')
- except UnicodeDecodeError:
+ rutf8.check_utf8(token, False)
+ except rutf8.CheckError:
return -1
from pypy.objspace.std.unicodeobject import _isidentifier
return _isidentifier(token)
diff --git a/pypy/module/cpyext/api.py b/pypy/module/cpyext/api.py
--- a/pypy/module/cpyext/api.py
+++ b/pypy/module/cpyext/api.py
@@ -14,7 +14,7 @@
from rpython.rlib.rfile import (FILEP, c_fread, c_fclose, c_fwrite,
c_fdopen, c_fileno,
c_fopen)# for tests
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
from rpython.translator import cdir
from rpython.translator.tool.cbuild import ExternalCompilationInfo
from rpython.translator.gensupp import NameManager
@@ -1722,17 +1722,14 @@
raise_import_error(space, space.newtext(msg), w_name, w_path)
def get_init_name(space, w_name):
- name_u = space.utf8_w(w_name).decode('utf8')
- basename_u = name_u.split(u'.')[-1]
- try:
- basename = basename_u.encode('ascii')
+ name = space.utf8_w(w_name)
+ basename = name.split('.')[-1]
+ if rutf8.first_non_ascii_char(basename) == -1:
return 'PyInit_%s' % (basename,)
- except UnicodeEncodeError:
- basename = space.bytes_w(encode_object(
- space, space.newtext(basename_u), 'punycode', None))
- basename = basename.replace('-', '_')
- return 'PyInitU_%s' % (basename,)
-
+ basename = space.bytes_w(encode_object(
+ space, space.newtext(basename), 'punycode', None))
+ basename = basename.replace('-', '_')
+ return 'PyInitU_%s' % (basename,)
initfunctype = lltype.Ptr(lltype.FuncType([], PyObject))
diff --git a/pypy/module/cpyext/state.py b/pypy/module/cpyext/state.py
--- a/pypy/module/cpyext/state.py
+++ b/pypy/module/cpyext/state.py
@@ -143,10 +143,12 @@
argv = space.sys.get('argv')
if space.len_w(argv):
argv0 = space.getitem(argv, space.newint(0))
- progname = space.utf8_w(argv0).decode('utf8')
+ progname = space.utf8_w(argv0)
+ lgt = space.len_w(argv0)
else:
- progname = u"pypy3"
- self.programname = rffi.unicode2wcharp(progname)
+ progname = "pypy3"
+ lgt = len(progname)
+ self.programname = rffi.utf82wcharp(progname, lgt)
lltype.render_immortal(self.programname)
return self.programname
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -358,9 +358,10 @@
if not get_wbuffer(ref):
# Copy unicode buffer
w_unicode = from_ref(space, rffi.cast(PyObject, ref))
- u = space.utf8_w(w_unicode).decode('utf8')
- set_wbuffer(ref, rffi.unicode2wcharp(u))
- set_wsize(ref, len(u))
+ u = space.utf8_w(w_unicode)
+ lgt = space.len_w(w_unicode)
+ set_wbuffer(ref, rffi.utf82wcharp(u, lgt))
+ set_wsize(ref, lgt)
if psize:
psize[0] = get_wsize(ref)
return get_wbuffer(ref)
@@ -950,19 +951,21 @@
than, equal, and greater than, respectively. It is best to pass only
ASCII-encoded strings, but the function interprets the input string as
ISO-8859-1 if it contains non-ASCII characters."""
- uni = space.utf8_w(w_uni).decode('utf8')
+ utf8 = space.utf8_w(w_uni)
+ lgt = space.len_w(w_uni)
i = 0
# Compare Unicode string and source character set string
- while i < len(uni) and string[i] != '\0':
- u = ord(uni[i])
+ for ch in rutf8.Utf8StringIterator(utf8):
+ if string[i] == '\0':
+ break
s = ord(string[i])
- if u != s:
- if u < s:
+ if ch != s:
+ if ch < s:
return -1
else:
return 1
i += 1
- if i < len(uni):
+ if i < lgt:
return 1 # uni is longer
if string[i] != '\0':
return -1 # str is longer
@@ -1061,14 +1064,6 @@
@cpython_api([PyObject, Py_ssize_t, Py_ssize_t], PyObject)
def PyUnicode_Substring(space, w_str, start, end):
- usrc = space.utf8_w(w_str).decode('utf8')
- length = len(usrc)
- if start < 0 or end < 0:
- raise oefmt(space.w_IndexError, "string index out of range")
- if start >= length or end < start:
- result = u''
- else:
- if end > length:
- end = length
- result = usrc[start:end]
- return space.newtext(result)
+ return space.call_method(w_str, '__getitem__',
+ space.newslice(space.newint(start), space.newint(end),
+ space.newint(1)))
diff --git a/pypy/module/struct/formatiterator.py b/pypy/module/struct/formatiterator.py
--- a/pypy/module/struct/formatiterator.py
+++ b/pypy/module/struct/formatiterator.py
@@ -164,6 +164,7 @@
elif isinstance(value, str):
w_value = self.space.newbytes(value)
elif isinstance(value, unicode):
+ assert not isinstance(value, unicode)
w_value = self.space.newutf8(value.decode('utf-8'), len(value))
elif isinstance(value, bool):
w_value = self.space.newbool(value)
More information about the pypy-commit
mailing list