[pypy-commit] pypy unicode-utf8-py3: fixes from trying pypy/module/_codecs/test

Sun Jun 17 03:35:35 EDT 2018

Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r94773:bdc01ece02b0
Date: 2018-06-17 00:34 -0700
http://bitbucket.org/pypy/pypy/changeset/bdc01ece02b0/

Log:	fixes from trying pypy/module/_codecs/test

diff --git a/pypy/interpreter/astcompiler/validate.py b/pypy/interpreter/astcompiler/validate.py
--- a/pypy/interpreter/astcompiler/validate.py
+++ b/pypy/interpreter/astcompiler/validate.py
@@ -409,7 +409,7 @@
     def visit_Str(self, node):
         space = self.space
         w_type = space.type(node.s)
-        if w_type != space.w_str:
+        if w_type != space.w_unicode:
             raise oefmt(space.w_TypeError, "non-string type in Str")
 
     def visit_Bytes(self, node):
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -883,7 +883,7 @@
             u = s.decode('utf-8')
         except UnicodeDecodeError:
             return None
-        return self.interned_strings.get(u)   # may be None
+        return self.interned_strings.get(s)   # may be None
 
     @specialize.arg(1)
     def descr_self_interp_w(self, RequiredClass, w_obj):
@@ -1718,10 +1718,10 @@
     def utf8_0_w(self, w_obj):
         "Like utf8_w, but rejects strings with NUL bytes."
         from rpython.rlib import rstring
-        result = w_obj.utf8_w(self).decode('utf8')
-        if u'\x00' in result:
+        result = w_obj.utf8_w(self)
+        if '\x00' in result:
             raise oefmt(self.w_ValueError,
-                        "argument must be a unicode string without NUL "
+                        "argument must be a utf8 string without NUL "
                         "characters")
         return rstring.assert_str0(result)
 
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -23,6 +23,17 @@
 def decode_utf8(u):
     return str_decode_utf8(u, "strict", True, None)
 
+def test_encode_utf8():
+    space = FakeSpace()
+    assert encode_utf8(space, u"abc") == "abc"
+    assert encode_utf8(space, u"\u1234") == "\xe1\x88\xb4"
+    py.test.raises(Hit, encode_utf8, space, u"\ud800")
+    py.test.raises(Hit, encode_utf8, space, u"\udc00")
+    # for the following test, go to lengths to avoid CPython's optimizer
+    # and .pyc file storage, which collapse the two surrogates into one
+    c = u"\udc00"
+    py.test.raises(Hit, encode_utf8, space, u"\ud800" + c)
+
 def test_encode_utf8_allow_surrogates():
     sp = FakeSpace()
     assert encode_utf8(sp, u"\ud800", allow_surrogates=True) == "\xed\xa0\x80"
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -168,8 +168,9 @@
     result, consumed = runicode.str_decode_utf_8(
         string, len(string), "strict",
         final=True, errorhandler=decode_error_handler(space),
-        allow_surrogates=allow_surrogates)
-    return result
+        # XXX handle surrogates
+        allow_surrogates=False)
+    return len(result)
 
 def str_decode_ascii(s, errors, final, errorhandler):
     try:
@@ -1211,8 +1212,8 @@
 
 def str_decode_utf_32(s, errors, final=True,
                       errorhandler=None):
-    result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final,
-        s, size, errors, final, errorhandler, "native", 'utf-32-' + BYTEORDER2,
+    result, c, lgt, _ = str_decode_utf_32_helper(
+        s, errors, final, errorhandler, "native", 'utf-32-' + BYTEORDER2,
         allow_surrogates=False)
     return result, c, lgt
 
diff --git a/pypy/module/time/interp_time.py b/pypy/module/time/interp_time.py
--- a/pypy/module/time/interp_time.py
+++ b/pypy/module/time/interp_time.py
@@ -5,7 +5,7 @@
 from pypy.interpreter.gateway import unwrap_spec
 from pypy.interpreter.timeutils import (
     SECS_TO_NS, MS_TO_NS, US_TO_NS, monotonic as _monotonic, timestamp_w)
-from pypy.interpreter.unicodehelper import decode_utf8, encode_utf8
+from pypy.interpreter.unicodehelper import str_decode_utf8
 from rpython.rtyper.lltypesystem import lltype
 from rpython.rlib.rarithmetic import (
     intmask, r_ulonglong, r_longfloat, widen, ovfcheck, ovfcheck_float_to_int)
@@ -554,7 +554,7 @@
 
     if HAS_TM_ZONE:
         # CPython calls PyUnicode_DecodeLocale here should we do the same?
-        tm_zone = decode_utf8(space, rffi.charp2str(t.c_tm_zone),
+        tm_zone = str_decode_utf8(rffi.charp2str(t.c_tm_zone),
                               allow_surrogates=True)
         extra = [space.newtext(tm_zone),
                  space.newint(rffi.getintfield(t, 'c_tm_gmtoff'))]
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -1183,54 +1183,43 @@
 
     # we should implement the same shortcuts as we do for BytesDictStrategy
 
-    def decodekey_str(self, key):
-        return str_decode_utf8(key, "string", True, None, allow_surrogates=True)[0]
+    ## def setitem_str(self, w_dict, key, w_value):
+    ##     assert key is not None
+    ##     self.unerase(w_dict.dstorage)[key] = w_value
 
-    def setitem_str(self, w_dict, key, w_value):
-        assert key is not None
-        self.unerase(w_dict.dstorage)[self.decodekey_str(key)] = w_value
+    ## def getitem(self, w_dict, w_key):
+    ##     space = self.space
+    ##     # -- This is called extremely often.  Hack for performance --
+    ##     if type(w_key) is space.StringObjectCls:
+    ##         return self.getitem_str(w_dict, w_key.unwrap(space))
+    ##     # -- End of performance hack --
+    ##     return AbstractTypedStrategy.getitem(self, w_dict, w_key)
 
-    def getitem(self, w_dict, w_key):
-        space = self.space
-        # -- This is called extremely often.  Hack for performance --
-        if type(w_key) is space.UnicodeObjectCls:
-            return self.unerase(w_dict.dstorage).get(w_key.unwrap(space), None)
-        # -- End of performance hack --
-        return AbstractTypedStrategy.getitem(self, w_dict, w_key)
-
-    def getitem_str(self, w_dict, key):
-        assert key is not None
-        return self.unerase(w_dict.dstorage).get(self.decodekey_str(key), None)
+    ## def getitem_str(self, w_dict, key):
+    ##     assert key is not None
+    ##     return self.unerase(w_dict.dstorage).get(key, None)
 
     def listview_utf8(self, w_dict):
         return self.unerase(w_dict.dstorage).keys()
 
-    def w_keys(self, w_dict):
-        return self.space.newlist_unicode(self.listview_unicode(w_dict))
+    ## def w_keys(self, w_dict):
+    ##     return self.space.newlist_bytes(self.listview_bytes(w_dict))
 
     def wrapkey(space, key):
         return space.newutf8(key, len(key))
 
-    @jit.look_inside_iff(lambda self, w_dict:
-                         w_dict_unrolling_heuristic(w_dict))
-    def view_as_kwargs(self, w_dict):
-        d = self.unerase(w_dict.dstorage)
-        l = len(d)
-        keys, values = [None] * l, [None] * l
-        i = 0
-        for key, val in d.iteritems():
-            keys[i] = key.encode('utf-8')
-            values[i] = val
-            i += 1
-        return keys, values
-
-    def get_storage_fromkeys(self, keys_w, w_fill):
-        """Return an initialized storage with keys and fill values"""
-        storage = {}
-        mark_dict_non_null(storage)
-        for key in keys_w:
-            storage[key] = w_fill
-        return self.erase(storage)
+    ## @jit.look_inside_iff(lambda self, w_dict:
+    ##                      w_dict_unrolling_heuristic(w_dict))
+    ## def view_as_kwargs(self, w_dict):
+    ##     d = self.unerase(w_dict.dstorage)
+    ##     l = len(d)
+    ##     keys, values = [None] * l, [None] * l
+    ##     i = 0
+    ##     for key, val in d.iteritems():
+    ##         keys[i] = key
+    ##         values[i] = val
+    ##         i += 1
+    ##     return keys, values
 
 create_iterator_classes(UnicodeDictStrategy)
 
diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py
--- a/pypy/objspace/std/newformat.py
+++ b/pypy/objspace/std/newformat.py
@@ -387,7 +387,7 @@
     if is_unicode:
         template = unicode_template_formatter(space,
                                               space.utf8_w(w_string))
-        r = template.build(args)
+        r = template.build(args, w_kwargs)
         lgt = rutf8.check_utf8(r, True)
         return space.newutf8(r, lgt)
     else:
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -159,13 +159,7 @@
             else:
                 return self.newint(x)
         if isinstance(x, str):
-            # this hack is temporary: look at the comment in
-            # test_stdstdobjspace.test_wrap_string
-            try:
-                unicode_x = x.decode('ascii')
-            except UnicodeDecodeError:
-                return self._wrap_string_old(x)
-            return self.newtext(unicode_x)
+            return self.newtext(x)
         if isinstance(x, unicode):
             x = x.encode('utf8')
             lgt = rutf8.check_utf8(x, True)
@@ -390,11 +384,17 @@
 
     @specialize.argtype(1)
     def newtext(self, s):
-        if isinstance(s, str):
+        if isinstance(s, unicode):
+            s, lgt = s.encode('utf8'), len(s)
+        elif isinstance(s, str):
             s, lgt, chk = str_decode_utf8(s, "string", True, None,
                                            allow_surrogates=True)
-            return W_UnicodeObject(s, lgt)
-        lgt = rutf8.check_utf8(s, True)
+        elif isinstance(s, tuple):
+            # result of decode_utf8
+            s, lgt, chk = s
+        else:
+            # XXX what is s ?
+            lgt = rutf8.check_utf8(s, True)
         return W_UnicodeObject(s, lgt)
 
     def newtext_or_none(self, s):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -90,7 +90,7 @@
     def text_w(self, space):
         try:
             identifier = jit.conditional_call_elidable(
-                                self._utf8, g_encode_utf8, self._value)
+                                self._utf8, g_encode_utf8, self._length)
         except SurrogateError as e:
             raise OperationError(space.w_UnicodeEncodeError,
                     space.newtuple([space.newtext('utf-8'),
@@ -126,13 +126,15 @@
         return True
 
     @staticmethod
-    def convert_arg_to_w_unicode(space, w_other):
+    def convert_arg_to_w_unicode(space, w_other, strict=None):
         if isinstance(w_other, W_UnicodeObject):
             return w_other
         if space.isinstance_w(w_other, space.w_bytes):
             return unicode_from_bytes(space, w_other)
-        raise oefmt(space.w_TypeError,
-                    "Can't convert '%T' object to str implicitly", w_other)
+        if strict:
+            raise oefmt(space.w_TypeError,
+                "%s arg must be None, unicode or str", strict)
+        return unicode_from_encoded_object(space, w_other, None, "strict")
 
     def convert_to_w_unicode(self, space):
         return self
@@ -190,6 +192,8 @@
         if w_object is None:
             w_value = W_UnicodeObject.EMPTY
         else:
+            if w_encoding and w_encoding._utf8 == 'unicode_internal':
+                import pdb;pdb.set_trace()
             encoding, errors = _get_encoding_and_errors(space, w_encoding,
                                                         w_errors)
             if encoding is None and errors is None:
@@ -292,7 +296,8 @@
 
     def descr_eq(self, space, w_other):
         try:
-            res = self._utf8 == self.convert_arg_to_w_unicode(space, w_other)._utf8
+            res = self._utf8 == self.convert_arg_to_w_unicode(space, w_other,
+                                                        strict='__eq__')._utf8
         except OperationError as e:
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
@@ -301,7 +306,8 @@
 
     def descr_ne(self, space, w_other):
         try:
-            res = self._utf8 != self.convert_arg_to_w_unicode(space, w_other)._utf8
+            res = self._utf8 != self.convert_arg_to_w_unicode(space, w_other,
+                                                     strict='__neq__')._utf8
         except OperationError as e:
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
@@ -1056,7 +1062,7 @@
     def _strip(self, space, w_chars, left, right, name='strip'):
         "internal function called by str_xstrip methods"
         value = self._utf8
-        chars = self.convert_arg_to_w_unicode(space, w_chars, strict=name)._utf8
+        chars = self.convert_arg_to_w_unicode(space, w_chars)._utf8
 
         lpos = 0
         rpos = len(value)