[pypy-commit] pypy unicode-utf8-py3: remove more bytes.decode, assert on unused case in module.struct

Sat Feb 2 12:37:20 EST 2019

Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95777:f165c244dfb4
Date: 2019-02-02 19:06 +0200
http://bitbucket.org/pypy/pypy/changeset/f165c244dfb4/

Log:	remove more bytes.decode, assert on unused case in module.struct

diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -11,6 +11,8 @@
   - rutf8.utf8_encode_mbcs
   - unicodehelper.fsencode
   - unicodehelper.unicode_to_decimal_w
+  - _winreg.interp_winreg
+* remove 'assert not isinstance(*, unicode)'
 * remove asserts from _WIN32 paths in rlib.rposix.re{name,place}
 * convert all realunicode_w to unicode_w after we flush out all old uses of
   unicode_w
diff --git a/pypy/interpreter/pyparser/pytokenizer.py b/pypy/interpreter/pyparser/pytokenizer.py
--- a/pypy/interpreter/pyparser/pytokenizer.py
+++ b/pypy/interpreter/pyparser/pytokenizer.py
@@ -65,14 +65,9 @@
 
 def verify_identifier(token):
     # 1=ok; 0=not an identifier; -1=bad utf-8
-    for c in token:
-        if ord(c) >= 0x80:
-            break
-    else:
-        return 1
     try:
-        u = token.decode('utf-8')
-    except UnicodeDecodeError:
+        rutf8.check_utf8(token, False)
+    except rutf8.CheckError:
         return -1
     from pypy.objspace.std.unicodeobject import _isidentifier
     return _isidentifier(token)
diff --git a/pypy/module/cpyext/api.py b/pypy/module/cpyext/api.py
--- a/pypy/module/cpyext/api.py
+++ b/pypy/module/cpyext/api.py
@@ -14,7 +14,7 @@
 from rpython.rlib.rfile import (FILEP, c_fread, c_fclose, c_fwrite,
         c_fdopen, c_fileno,
         c_fopen)# for tests
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
 from rpython.translator import cdir
 from rpython.translator.tool.cbuild import ExternalCompilationInfo
 from rpython.translator.gensupp import NameManager
@@ -1722,17 +1722,14 @@
     raise_import_error(space, space.newtext(msg), w_name, w_path)
 
 def get_init_name(space, w_name):
-    name_u = space.utf8_w(w_name).decode('utf8')
-    basename_u = name_u.split(u'.')[-1]
-    try:
-        basename = basename_u.encode('ascii')
+    name = space.utf8_w(w_name)
+    basename = name.split('.')[-1]
+    if rutf8.first_non_ascii_char(basename) == -1:
         return 'PyInit_%s' % (basename,)
-    except UnicodeEncodeError:
-        basename = space.bytes_w(encode_object(
-            space, space.newtext(basename_u), 'punycode', None))
-        basename = basename.replace('-', '_')
-        return 'PyInitU_%s' % (basename,)
-
+    basename = space.bytes_w(encode_object(
+        space, space.newtext(basename), 'punycode', None))
+    basename = basename.replace('-', '_')
+    return 'PyInitU_%s' % (basename,)
 
 initfunctype = lltype.Ptr(lltype.FuncType([], PyObject))
 
diff --git a/pypy/module/cpyext/state.py b/pypy/module/cpyext/state.py
--- a/pypy/module/cpyext/state.py
+++ b/pypy/module/cpyext/state.py
@@ -143,10 +143,12 @@
             argv = space.sys.get('argv')
             if space.len_w(argv):
                 argv0 = space.getitem(argv, space.newint(0))
-                progname = space.utf8_w(argv0).decode('utf8')
+                progname = space.utf8_w(argv0)
+                lgt = space.len_w(argv0)
             else:
-                progname = u"pypy3"
-            self.programname = rffi.unicode2wcharp(progname)
+                progname = "pypy3"
+                lgt = len(progname)
+            self.programname = rffi.utf82wcharp(progname, lgt)
             lltype.render_immortal(self.programname)
         return self.programname
 
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -358,9 +358,10 @@
     if not get_wbuffer(ref):
         # Copy unicode buffer
         w_unicode = from_ref(space, rffi.cast(PyObject, ref))
-        u = space.utf8_w(w_unicode).decode('utf8')
-        set_wbuffer(ref, rffi.unicode2wcharp(u))
-        set_wsize(ref, len(u))
+        u = space.utf8_w(w_unicode)
+        lgt = space.len_w(w_unicode)
+        set_wbuffer(ref, rffi.utf82wcharp(u, lgt))
+        set_wsize(ref, lgt)
     if psize:
         psize[0] = get_wsize(ref)
     return get_wbuffer(ref)
@@ -950,19 +951,21 @@
     than, equal, and greater than, respectively. It is best to pass only
     ASCII-encoded strings, but the function interprets the input string as
     ISO-8859-1 if it contains non-ASCII characters."""
-    uni = space.utf8_w(w_uni).decode('utf8')
+    utf8 = space.utf8_w(w_uni)
+    lgt = space.len_w(w_uni)
     i = 0
     # Compare Unicode string and source character set string
-    while i < len(uni) and string[i] != '\0':
-        u = ord(uni[i])
+    for ch in rutf8.Utf8StringIterator(utf8):
+        if string[i] == '\0':
+            break
         s = ord(string[i])
-        if u != s:
-            if u < s:
+        if ch != s:
+            if ch < s:
                 return -1
             else:
                 return 1
         i += 1
-    if i < len(uni):
+    if i < lgt:
         return 1  # uni is longer
     if string[i] != '\0':
         return -1  # str is longer
@@ -1061,14 +1064,6 @@
 
 @cpython_api([PyObject, Py_ssize_t, Py_ssize_t], PyObject)
 def PyUnicode_Substring(space, w_str, start, end):
-    usrc = space.utf8_w(w_str).decode('utf8')
-    length = len(usrc)
-    if start < 0 or end < 0:
-        raise oefmt(space.w_IndexError, "string index out of range")
-    if start >= length or end < start:
-        result = u''
-    else:
-        if end > length:
-            end = length
-        result = usrc[start:end]
-    return space.newtext(result)
+    return space.call_method(w_str, '__getitem__',
+                         space.newslice(space.newint(start), space.newint(end),
+                                        space.newint(1)))
diff --git a/pypy/module/struct/formatiterator.py b/pypy/module/struct/formatiterator.py
--- a/pypy/module/struct/formatiterator.py
+++ b/pypy/module/struct/formatiterator.py
@@ -164,6 +164,7 @@
         elif isinstance(value, str):
             w_value = self.space.newbytes(value)
         elif isinstance(value, unicode):
+            assert not isinstance(value, unicode)
             w_value = self.space.newutf8(value.decode('utf-8'), len(value))
         elif isinstance(value, bool):
             w_value = self.space.newbool(value)