[pypy-commit] pypy unicode-utf8: fight until the strategies seem to work again for ascii unicode strings at least

Sat Nov 4 18:17:05 EDT 2017

Author: fijal
Branch: unicode-utf8
Changeset: r92940:1645f5285398
Date: 2017-11-04 20:32 +0100
http://bitbucket.org/pypy/pypy/changeset/1645f5285398/

Log:	fight until the strategies seem to work again for ascii unicode
	strings at least

diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -3,7 +3,7 @@
 
 from rpython.rlib.cache import Cache
 from rpython.tool.uid import HUGEVAL_BYTES
-from rpython.rlib import jit, types
+from rpython.rlib import jit, types, rutf8
 from rpython.rlib.debug import make_sure_not_resized
 from rpython.rlib.objectmodel import (we_are_translated, newlist_hint,
      compute_unique_id, specialize, not_rpython)
@@ -1084,8 +1084,12 @@
     def newlist_bytes(self, list_s):
         return self.newlist([self.newbytes(s) for s in list_s])
 
-    def newlist_unicode(self, list_u):
-        return self.newlist([self.newunicode(u) for u in list_u])
+    def newlist_utf8(self, list_u, is_ascii):
+        l_w = [None] * len(list_u)
+        for i, item in enumerate(list_u):
+            length, flag = rutf8.check_utf8(item, True)
+            l_w[i] = self.newutf8(item, length, flag)
+        return self.newlist(l_w)
 
     def newlist_int(self, list_i):
         return self.newlist([self.newint(i) for i in list_i])
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -1,6 +1,6 @@
 """The builtin dict implementation"""
 
-from rpython.rlib import jit, rerased, objectmodel
+from rpython.rlib import jit, rerased, objectmodel, rutf8
 from rpython.rlib.debug import mark_dict_non_null
 from rpython.rlib.objectmodel import newlist_hint, r_dict, specialize
 from rpython.tool.sourcetools import func_renamer, func_with_new_name
@@ -441,7 +441,7 @@
                     popitem delitem clear \
                     length w_keys values items \
                     iterkeys itervalues iteritems \
-                    listview_bytes listview_unicode listview_int \
+                    listview_bytes listview_utf8 listview_int \
                     view_as_kwargs".split()
 
     def make_method(method):
@@ -593,7 +593,7 @@
     def listview_bytes(self, w_dict):
         return None
 
-    def listview_unicode(self, w_dict):
+    def listview_utf8(self, w_dict):
         return None
 
     def listview_int(self, w_dict):
@@ -640,7 +640,7 @@
         if type(w_key) is self.space.StringObjectCls:
             self.switch_to_bytes_strategy(w_dict)
             return
-        elif type(w_key) is self.space.UnicodeObjectCls:
+        elif type(w_key) is self.space.UnicodeObjectCls and w_key.is_ascii():
             self.switch_to_unicode_strategy(w_dict)
             return
         w_type = self.space.type(w_key)
@@ -1197,14 +1197,14 @@
     unerase = staticmethod(unerase)
 
     def wrap(self, unwrapped):
-        return self.space.newunicode(unwrapped)
+        return self.space.newutf8(unwrapped, len(unwrapped), rutf8.FLAG_ASCII)
 
     def unwrap(self, wrapped):
-        return self.space.unicode_w(wrapped)
+        return self.space.utf8_w(wrapped)
 
     def is_correct_type(self, w_obj):
         space = self.space
-        return space.is_w(space.type(w_obj), space.w_unicode)
+        return type(w_obj) is space.UnicodeObjectCls and w_obj.is_ascii()
 
     def get_empty_storage(self):
         res = {}
@@ -1232,14 +1232,14 @@
     ##     assert key is not None
     ##     return self.unerase(w_dict.dstorage).get(key, None)
 
-    def listview_unicode(self, w_dict):
+    def listview_utf8(self, w_dict):
         return self.unerase(w_dict.dstorage).keys()
 
     ## def w_keys(self, w_dict):
     ##     return self.space.newlist_bytes(self.listview_bytes(w_dict))
 
     def wrapkey(space, key):
-        return space.newunicode(key)
+        return space.newutf8(key, len(key), rutf8.FLAG_ASCII)
 
     ## @jit.look_inside_iff(lambda self, w_dict:
     ##                      w_dict_unrolling_heuristic(w_dict))
diff --git a/pypy/objspace/std/listobject.py b/pypy/objspace/std/listobject.py
--- a/pypy/objspace/std/listobject.py
+++ b/pypy/objspace/std/listobject.py
@@ -196,7 +196,7 @@
         return W_ListObject.from_storage_and_strategy(space, storage, strategy)
 
     @staticmethod
-    def newlist_unicode(space, list_u):
+    def newlist_utf8(space, list_u):
         strategy = space.fromcache(UnicodeListStrategy)
         storage = strategy.erase(list_u)
         return W_ListObject.from_storage_and_strategy(space, storage, strategy)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -309,19 +309,10 @@
 
     newlist_text = newlist_bytes
 
-    def newlist_unicode(self, list_u):
-        xxx
-        return self.newlist(list_u)
-        return W_ListObject.newlist_unicode(self, list_u)
-
-    def newlist_utf8(self, lst):
-        res_w = []
-        for utf in lst:
-            assert utf is not None
-            assert isinstance(utf, str)
-            length, flag = rutf8.check_utf8(utf, allow_surrogates=True)
-            res_w.append(self.newutf8(utf, length, flag))
-        return self.newlist(res_w)
+    def newlist_utf8(self, list_u, is_ascii):
+        if is_ascii:
+            return W_ListObject.newlist_utf8(self, list_u)
+        return ObjSpace.newlist_utf8(self, list_u, False)
 
     def newlist_int(self, list_i):
         return W_ListObject.newlist_int(self, list_i)
@@ -515,9 +506,9 @@
         if type(w_obj) is W_ListObject:
             return w_obj.getitems_utf8()
         if type(w_obj) is W_DictObject:
-            return w_obj.listview_unicode()
+            return w_obj.listview_utf8()
         if type(w_obj) is W_SetObject or type(w_obj) is W_FrozensetObject:
-            return w_obj.listview_unicode()
+            return w_obj.listview_utf8()
         if (isinstance(w_obj, W_UnicodeObject) and self._uni_uses_no_iter(w_obj)
             and w_obj.is_ascii()):
             return w_obj.listview_utf8()
diff --git a/pypy/objspace/std/setobject.py b/pypy/objspace/std/setobject.py
--- a/pypy/objspace/std/setobject.py
+++ b/pypy/objspace/std/setobject.py
@@ -12,7 +12,7 @@
 from rpython.rlib.objectmodel import iterkeys_with_hash, contains_with_hash
 from rpython.rlib.objectmodel import setitem_with_hash, delitem_with_hash
 from rpython.rlib.rarithmetic import intmask, r_uint
-from rpython.rlib import rerased, jit
+from rpython.rlib import rerased, jit, rutf8
 
 
 UNROLL_CUTOFF = 5
@@ -86,9 +86,9 @@
         """ If this is a string set return its contents as a list of uwnrapped strings. Otherwise return None. """
         return self.strategy.listview_bytes(self)
 
-    def listview_unicode(self):
+    def listview_utf8(self):
         """ If this is a unicode set return its contents as a list of uwnrapped unicodes. Otherwise return None. """
-        return self.strategy.listview_unicode(self)
+        return self.strategy.listview_utf8(self)
 
     def listview_int(self):
         """ If this is an int set return its contents as a list of uwnrapped ints. Otherwise return None. """
@@ -690,7 +690,7 @@
     def listview_bytes(self, w_set):
         return None
 
-    def listview_unicode(self, w_set):
+    def listview_utf8(self, w_set):
         return None
 
     def listview_int(self, w_set):
@@ -795,8 +795,8 @@
             strategy = self.space.fromcache(IntegerSetStrategy)
         elif type(w_key) is W_BytesObject:
             strategy = self.space.fromcache(BytesSetStrategy)
-        #elif type(w_key) is W_UnicodeObject:
-        #    strategy = self.space.fromcache(UnicodeSetStrategy)
+        elif type(w_key) is W_UnicodeObject and w_key.is_ascii():
+            strategy = self.space.fromcache(UnicodeSetStrategy)
         elif self.space.type(w_key).compares_by_identity():
             strategy = self.space.fromcache(IdentitySetStrategy)
         else:
@@ -1272,11 +1272,11 @@
     def get_empty_dict(self):
         return {}
 
-    def listview_unicode(self, w_set):
+    def listview_utf8(self, w_set):
         return self.unerase(w_set.sstorage).keys()
 
     def is_correct_type(self, w_key):
-        return type(w_key) is W_UnicodeObject
+        return type(w_key) is W_UnicodeObject and w_key.is_ascii()
 
     def may_contain_equal_elements(self, strategy):
         if strategy is self.space.fromcache(IntegerSetStrategy):
@@ -1495,7 +1495,7 @@
 
     def next_entry(self):
         for key in self.iterator:
-            return self.space.newunicode(key)
+            return self.space.newutf8(key, len(key), rutf8.FLAG_ASCII)
         else:
             return None
 
@@ -1636,13 +1636,13 @@
         return
 
     # check for unicode
-    #for w_item in iterable_w:
-    #    if type(w_item) is not W_UnicodeObject:
-    #        break
-    #else:
-    #    w_set.strategy = space.fromcache(UnicodeSetStrategy)
-    #    w_set.sstorage = w_set.strategy.get_storage_from_list(iterable_w)
-    #    return
+    for w_item in iterable_w:
+        if type(w_item) is not W_UnicodeObject or not w_item.is_ascii():
+            break
+    else:
+        w_set.strategy = space.fromcache(UnicodeSetStrategy)
+        w_set.sstorage = w_set.strategy.get_storage_from_list(iterable_w)
+        return
 
     # check for compares by identity
     for w_item in iterable_w:
diff --git a/pypy/objspace/std/test/test_dictmultiobject.py b/pypy/objspace/std/test/test_dictmultiobject.py
--- a/pypy/objspace/std/test/test_dictmultiobject.py
+++ b/pypy/objspace/std/test/test_dictmultiobject.py
@@ -142,11 +142,10 @@
         assert self.space.listview_bytes(w_d) == ["a", "b"]
 
     def test_listview_unicode_dict(self):
-        py.test.skip("listview_unicode disabled")
         w = self.space.wrap
         w_d = self.space.newdict()
         w_d.initialize_content([(w(u"a"), w(1)), (w(u"b"), w(2))])
-        assert self.space.listview_unicode(w_d) == [u"a", u"b"]
+        assert self.space.listview_utf8(w_d) == ["a", "b"]
 
     def test_listview_int_dict(self):
         w = self.space.wrap
diff --git a/pypy/objspace/std/test/test_liststrategies.py b/pypy/objspace/std/test/test_liststrategies.py
--- a/pypy/objspace/std/test/test_liststrategies.py
+++ b/pypy/objspace/std/test/test_liststrategies.py
@@ -675,10 +675,10 @@
             w_l4 = space.call_method(w_u, "rsplit", space.wrap(" "))
         finally:
             del space.newlist
-        assert space.listview_unicode(w_l) == [u"a", u"b", u"c"]
-        assert space.listview_unicode(w_l2) == [u"a", u"b", u"c"]
-        assert space.listview_unicode(w_l3) == [u"a", u"b", u"c"]
-        assert space.listview_unicode(w_l4) == [u"a", u"b", u"c"]
+        assert space.listview_utf8(w_l) == [u"a", u"b", u"c"]
+        assert space.listview_utf8(w_l2) == [u"a", u"b", u"c"]
+        assert space.listview_utf8(w_l3) == [u"a", u"b", u"c"]
+        assert space.listview_utf8(w_l4) == [u"a", u"b", u"c"]
 
     def test_pop_without_argument_is_fast(self):
         space = self.space
@@ -722,7 +722,7 @@
     def test_listview_unicode_list(self):
         space = self.space
         w_l = W_ListObject(space, [space.wrap(u"a"), space.wrap(u"b")])
-        assert self.space.listview_unicode(w_l) == [u"a", u"b"]
+        assert self.space.listview_utf8(w_l) == [u"a", u"b"]
 
     def test_listview_int_list(self):
         space = self.space
diff --git a/pypy/objspace/std/test/test_setstrategies.py b/pypy/objspace/std/test/test_setstrategies.py
--- a/pypy/objspace/std/test/test_setstrategies.py
+++ b/pypy/objspace/std/test/test_setstrategies.py
@@ -42,7 +42,6 @@
         assert s1.strategy is self.space.fromcache(ObjectSetStrategy)
 
     def test_switch_to_unicode(self):
-        py.test.skip("disabled")
         s = W_SetObject(self.space, self.wrapped([]))
         s.add(self.space.wrap(u"six"))
         assert s.strategy is self.space.fromcache(UnicodeSetStrategy)
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -28,7 +28,7 @@
 
     def test_listview_unicode(self):
         w_str = self.space.newutf8('abcd', 4, rutf8.FLAG_ASCII)
-        assert self.space.listview_unicode(w_str) == list(u"abcd")
+        assert self.space.listview_utf8(w_str) == list("abcd")
 
     def test_new_shortcut(self):
         space = self.space
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -196,10 +196,6 @@
     def _islinebreak(self, s, pos):
         return rutf8.islinebreak(s, pos)
 
-    def _newlist_unwrapped(self, space, lst):
-        assert False, "should not be called"
-        return space.newlist_unicode(lst)
-
     @staticmethod
     @unwrap_spec(w_string=WrappedDefault(""))
     def descr_new(space, w_unicodetype, w_string, w_encoding=None,
@@ -503,11 +499,11 @@
     _StringMethods_descr_join = descr_join
     def descr_join(self, space, w_list):
         l = space.listview_utf8(w_list)
-        if l is not None:
-            xxxx
+        if l is not None and self.is_ascii():
             if len(l) == 1:
-                return space.newunicode(l[0])
-            return space.newunicode(self._utf8).join(l)
+                return space.newutf8(l[0], len(l[0]), rutf8.FLAG_ASCII)
+            s = self._utf8.join(l)
+            return space.newutf8(s, len(s), rutf8.FLAG_ASCII)
         return self._StringMethods_descr_join(space, w_list)
 
     def _join_return_one(self, space, w_obj):
@@ -755,14 +751,14 @@
         value = self._utf8
         if space.is_none(w_sep):
             res = split(value, maxsplit=maxsplit, isutf8=True)
-            return space.newlist_utf8(res)
+            return space.newlist_utf8(res, self.is_ascii())
 
         by = self.convert_arg_to_w_unicode(space, w_sep)._utf8
         if len(by) == 0:
             raise oefmt(space.w_ValueError, "empty separator")
         res = split(value, by, maxsplit, isutf8=True)
 
-        return space.newlist_utf8(res)
+        return space.newlist_utf8(res, self.is_ascii())
 
     @unwrap_spec(maxsplit=int)
     def descr_rsplit(self, space, w_sep=None, maxsplit=-1):
@@ -770,14 +766,14 @@
         value = self._utf8
         if space.is_none(w_sep):
             res = rsplit(value, maxsplit=maxsplit, isutf8=True)
-            return space.newlist_utf8(res)
+            return space.newlist_utf8(res, self.is_ascii())
 
         by = self.convert_arg_to_w_unicode(space, w_sep)._utf8
         if len(by) == 0:
             raise oefmt(space.w_ValueError, "empty separator")
         res = rsplit(value, by, maxsplit, isutf8=True)
 
-        return space.newlist_utf8(res)
+        return space.newlist_utf8(res, self.is_ascii())
 
     def descr_getitem(self, space, w_index):
         if isinstance(w_index, W_SliceObject):