[pypy-commit] pypy unicode-utf8-py3: disallow tuple input to newtext, and also refactor some unicode/utf8 recoding

Sun Dec 2 11:52:11 EST 2018

Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95399:5a6b7f57a324
Date: 2018-11-29 22:08 -0800
http://bitbucket.org/pypy/pypy/changeset/5a6b7f57a324/

Log:	disallow tuple input to newtext, and also refactor some unicode/utf8
	recoding

diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -115,7 +115,7 @@
             return W_FString(substr, rawmode)
         else:
             v = unicodehelper.str_decode_utf8(substr, 'strict', True, None)
-            return space.newtext(v)
+            return space.newtext(*v)
 
     v = PyString_DecodeEscape(space, substr, 'strict', encoding)
     return space.newbytes(v)
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -771,7 +771,7 @@
         self._check_closed(space)
         self._writeflush(space)
         limit = convert_size(space, w_limit)
-        return space.newtext(self._readline(space, limit))
+        return space.newtext(*self._readline(space, limit))
 
     def _readline(self, space, limit):
         # This is a separate function so that readline_w() can be jitted.
diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -1154,7 +1154,7 @@
                     raise oefmt(space.w_ValueError,
                                 "array contains a unicode character out of "
                                 "range(0x110000)")
-                return space.newtext(item)
+                return space.newtext(rutf8.unichr_as_utf8(ord(item)), 1)
             assert 0, "unreachable"
 
         # interface
diff --git a/pypy/module/time/interp_time.py b/pypy/module/time/interp_time.py
--- a/pypy/module/time/interp_time.py
+++ b/pypy/module/time/interp_time.py
@@ -459,8 +459,8 @@
 
     _set_module_object(space, "timezone", space.newint(timezone))
     _set_module_object(space, 'daylight', space.newint(daylight))
-    tzname_w = [space.newtext(tzname[0].decode('latin-1')),
-                space.newtext(tzname[1].decode('latin-1'))]
+    tzname_w = [space.newtext(tzname[0]),
+                space.newtext(tzname[1])]
     _set_module_object(space, 'tzname', space.newtuple(tzname_w))
     _set_module_object(space, 'altzone', space.newint(altzone))
 
diff --git a/pypy/objspace/std/marshal_impl.py b/pypy/objspace/std/marshal_impl.py
--- a/pypy/objspace/std/marshal_impl.py
+++ b/pypy/objspace/std/marshal_impl.py
@@ -371,9 +371,9 @@
     m.atom_str(TYPE_STRING, x.co_code)
     _marshal_tuple(space, x.co_consts_w, m)
     _marshal_tuple(space, x.co_names_w, m)   # list of w_unicodes
-    co_varnames_w = [space.newtext(_decode_utf8(space, s)) for s in x.co_varnames]
-    co_freevars_w = [space.newtext(_decode_utf8(space, s)) for s in x.co_freevars]
-    co_cellvars_w = [space.newtext(_decode_utf8(space, s)) for s in x.co_cellvars]
+    co_varnames_w = [space.newtext(*_decode_utf8(space, s)) for s in x.co_varnames]
+    co_freevars_w = [space.newtext(*_decode_utf8(space, s)) for s in x.co_freevars]
+    co_cellvars_w = [space.newtext(*_decode_utf8(space, s)) for s in x.co_cellvars]
     _marshal_tuple(space, co_varnames_w, m)  # more lists, now of w_unicodes
     _marshal_tuple(space, co_freevars_w, m)
     _marshal_tuple(space, co_cellvars_w, m)
@@ -451,7 +451,7 @@
 @unmarshaller(TYPE_UNICODE)
 def unmarshal_unicode(space, u, tc):
     uc = _decode_utf8(space, u.get_str())
-    return space.newtext(uc)
+    return space.newtext(*uc)
 
 @unmarshaller(TYPE_INTERNED)
 def unmarshal_interned(space, u, tc):
@@ -464,7 +464,7 @@
     else:
         lng = u.get_lng()
     s = u.get(lng)
-    w_u = u.space.newtext(s.decode('latin-1'))
+    w_u = u.space.newtext(s)
     if interned:
         w_u = u.space.new_interned_w_str(w_u)
     return w_u
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -380,16 +380,16 @@
     def newbytearray(self, l):
         return W_BytearrayObject(l)
 
+    # XXX TODO - remove this and force all users to call with utf8
     @specialize.argtype(1)
-    def newtext(self, s, lgt=-1):
+    def newtext(self, s, lgt=-1, unused=-1):
+        # `unused` absorbs the extra third value when a caller unpacks a
+        # decode result into the call: newtext(*decode_utf8sp(space, code))
         if isinstance(s, unicode):
             s, lgt = s.encode('utf8'), len(s)
-        elif isinstance(s, str) and lgt < 0:
+        assert isinstance(s, str)
+        if lgt < 0:
             lgt = rutf8.codepoints_in_utf8(s)
-        elif isinstance(s, tuple):
-            # result of decode_utf8
-            s, lgt, codepoints = s
-        assert isinstance(s, str)
         return W_UnicodeObject(s, lgt)
 
     def newtext_or_none(self, s, lgt=-1):