[pypy-commit] pypy unicode-utf8: whack enough to get somewhere with the list strategy - just for ascii-unicode so far

Sat Nov 4 18:17:03 EDT 2017

Author: fijal
Branch: unicode-utf8
Changeset: r92939:0aeb46cc86b0
Date: 2017-11-04 19:37 +0100
http://bitbucket.org/pypy/pypy/changeset/0aeb46cc86b0/

Log:	whack enough to get somewhere with the list strategy - just for
	ascii-unicode so far

diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1054,7 +1054,7 @@
         """
         return None
 
-    def listview_unicode(self, w_list):
+    def listview_utf8(self, w_list):
         """ Return a list of unwrapped unicode out of a list of unicode. If the
         argument is not a list or does not contain only unicode, return None.
         May return None anyway.
diff --git a/pypy/objspace/std/listobject.py b/pypy/objspace/std/listobject.py
--- a/pypy/objspace/std/listobject.py
+++ b/pypy/objspace/std/listobject.py
@@ -10,7 +10,7 @@
 import operator
 import sys
 
-from rpython.rlib import debug, jit, rerased
+from rpython.rlib import debug, jit, rerased, rutf8
 from rpython.rlib.listsort import make_timsort_class
 from rpython.rlib.objectmodel import (
     import_from_mixin, instantiate, newlist_hint, resizelist_hint, specialize)
@@ -95,10 +95,11 @@
         else:
             return space.fromcache(BytesListStrategy)
 
-    elif False and type(w_firstobj) is W_UnicodeObject: # disable unicode list strat
+    elif type(w_firstobj) is W_UnicodeObject and w_firstobj.is_ascii():
         # check for all-unicodes
         for i in range(1, len(list_w)):
-            if type(list_w[i]) is not W_UnicodeObject:
+            item = list_w[i]
+            if type(item) is not W_UnicodeObject or not item.is_ascii():
                 break
         else:
             return space.fromcache(UnicodeListStrategy)
@@ -196,7 +197,6 @@
 
     @staticmethod
     def newlist_unicode(space, list_u):
-        xxxx
         strategy = space.fromcache(UnicodeListStrategy)
         storage = strategy.erase(list_u)
         return W_ListObject.from_storage_and_strategy(space, storage, strategy)
@@ -349,10 +349,10 @@
         not use the list strategy, return None."""
         return self.strategy.getitems_bytes(self)
 
-    def getitems_unicode(self):
+    def getitems_utf8(self):
         """Return the items in the list as unwrapped unicodes. If the list does
         not use the list strategy, return None."""
-        return self.strategy.getitems_unicode(self)
+        return self.strategy.getitems_utf8(self)
 
     def getitems_int(self):
         """Return the items in the list as unwrapped ints. If the list does not
@@ -813,7 +813,7 @@
     def getitems_bytes(self, w_list):
         return None
 
-    def getitems_unicode(self, w_list):
+    def getitems_utf8(self, w_list):
         return None
 
     def getitems_int(self, w_list):
@@ -954,8 +954,8 @@
             strategy = self.space.fromcache(IntegerListStrategy)
         elif type(w_item) is W_BytesObject:
             strategy = self.space.fromcache(BytesListStrategy)
-        #elif type(w_item) is W_UnicodeObject:
-        #    strategy = self.space.fromcache(UnicodeListStrategy)
+        elif type(w_item) is W_UnicodeObject and w_item.is_ascii():
+            strategy = self.space.fromcache(UnicodeListStrategy)
         elif type(w_item) is W_FloatObject:
             strategy = self.space.fromcache(FloatListStrategy)
         else:
@@ -1025,9 +1025,8 @@
             w_list.lstorage = strategy.erase(byteslist[:])
             return
 
-        if False:
-          unilist = space.listview_unicode(w_iterable)
-          if unilist is not None:
+        unilist = space.listview_utf8(w_iterable)
+        if unilist is not None:
             w_list.strategy = strategy = space.fromcache(UnicodeListStrategy)
             # need to copy because intlist can share with w_iterable
             w_list.lstorage = strategy.erase(unilist[:])
@@ -1995,11 +1994,11 @@
 class UnicodeListStrategy(ListStrategy):
     import_from_mixin(AbstractUnwrappedStrategy)
 
-    _none_value = u""
+    _none_value = ""
 
     def wrap(self, stringval):
         assert stringval is not None
-        return self.space.newunicode(stringval)
+        return self.space.newutf8(stringval, len(stringval), rutf8.FLAG_ASCII)
 
     def unwrap(self, w_string):
         return self.space.utf8_w(w_string)
@@ -2009,7 +2008,7 @@
     unerase = staticmethod(unerase)
 
     def is_correct_type(self, w_obj):
-        return type(w_obj) is W_UnicodeObject
+        return type(w_obj) is W_UnicodeObject and w_obj.is_ascii()
 
     def list_is_correct_type(self, w_list):
         return w_list.strategy is self.space.fromcache(UnicodeListStrategy)
@@ -2021,7 +2020,7 @@
         if reverse:
             l.reverse()
 
-    def getitems_unicode(self, w_list):
+    def getitems_utf8(self, w_list):
         return self.unerase(w_list.lstorage)
 
 # _______________________________________________________
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -164,7 +164,9 @@
         if isinstance(x, str):
             return self.newtext(x)
         if isinstance(x, unicode):
-            return self.newutf8(x.encode('utf8'), len(x), rutf8.FLAG_REGULAR)
+            from pypy.interpreter import unicodehelper
+            return self.newutf8(x.encode('utf8'), len(x),
+                                unicodehelper._get_flag(x))
         if isinstance(x, float):
             return W_FloatObject(x)
         if isinstance(x, W_Root):
@@ -507,20 +509,20 @@
             return w_obj.getitems_bytes()
         return None
 
-    def listview_unicode(self, w_obj):
+    def listview_utf8(self, w_obj):
         # note: uses exact type checking for objects with strategies,
         # and isinstance() for others.  See test_listobject.test_uses_custom...
         if type(w_obj) is W_ListObject:
-            return w_obj.getitems_unicode()
+            return w_obj.getitems_utf8()
         if type(w_obj) is W_DictObject:
             return w_obj.listview_unicode()
         if type(w_obj) is W_SetObject or type(w_obj) is W_FrozensetObject:
             return w_obj.listview_unicode()
         if (isinstance(w_obj, W_UnicodeObject) and self._uni_uses_no_iter(w_obj)
             and w_obj.is_ascii()):
-            return w_obj.listview_unicode()
+            return w_obj.listview_utf8()
         if isinstance(w_obj, W_ListObject) and self._uses_list_iter(w_obj):
-            return w_obj.getitems_unicode()
+            return w_obj.getitems_utf8()
         return None
 
     def listview_int(self, w_obj):
diff --git a/pypy/objspace/std/setobject.py b/pypy/objspace/std/setobject.py
--- a/pypy/objspace/std/setobject.py
+++ b/pypy/objspace/std/setobject.py
@@ -1591,7 +1591,7 @@
         w_set.sstorage = strategy.get_storage_from_unwrapped_list(byteslist)
         return
 
-    unicodelist = space.listview_unicode(w_iterable)
+    unicodelist = space.listview_utf8(w_iterable)
     if unicodelist is not None:
         strategy = space.fromcache(UnicodeSetStrategy)
         w_set.strategy = strategy
diff --git a/pypy/objspace/std/test/test_liststrategies.py b/pypy/objspace/std/test/test_liststrategies.py
--- a/pypy/objspace/std/test/test_liststrategies.py
+++ b/pypy/objspace/std/test/test_liststrategies.py
@@ -20,9 +20,9 @@
                           IntegerListStrategy)
         assert isinstance(W_ListObject(space, [wb('a'), wb('b')]).strategy,
                           BytesListStrategy)
-        #assert isinstance(W_ListObject(space, [w(u'a'), w(u'b')]).strategy,
-        #                  UnicodeListStrategy)
-        assert isinstance(W_ListObject(space, [space.newutf8('a', 1, 0), wb('b')]).strategy,
+        assert isinstance(W_ListObject(space, [w(u'a'), w(u'b')]).strategy,
+                          UnicodeListStrategy)
+        assert isinstance(W_ListObject(space, [w(u'a'), wb('b')]).strategy,
                           ObjectListStrategy) # mixed unicode and bytes
 
     def test_empty_to_any(self):
@@ -47,7 +47,7 @@
         l = W_ListObject(space, [])
         assert isinstance(l.strategy, EmptyListStrategy)
         l.append(w(u'a'))
-        #assert isinstance(l.strategy, UnicodeListStrategy)
+        assert isinstance(l.strategy, UnicodeListStrategy)
 
         l = W_ListObject(space, [])
         assert isinstance(l.strategy, EmptyListStrategy)
@@ -74,7 +74,6 @@
         assert isinstance(l.strategy, ObjectListStrategy)
 
     def test_unicode_to_any(self):
-        py.test.skip("disabled")
         space = self.space
         l = W_ListObject(space, [space.wrap(u'a'), space.wrap(u'b'), space.wrap(u'c')])
         assert isinstance(l.strategy, UnicodeListStrategy)
@@ -118,7 +117,7 @@
 
         # UnicodeStrategy to ObjectStrategy
         l = W_ListObject(space, [w(u'a'),w(u'b'),w(u'c')])
-        #assert isinstance(l.strategy, UnicodeListStrategy)
+        assert isinstance(l.strategy, UnicodeListStrategy)
         l.setitem(0, w(2))
         assert isinstance(l.strategy, ObjectListStrategy)
 
@@ -146,7 +145,7 @@
 
         # UnicodeStrategy
         l = W_ListObject(space, [w(u'a'),w(u'b'),w(u'c')])
-        #assert isinstance(l.strategy, UnicodeListStrategy)
+        assert isinstance(l.strategy, UnicodeListStrategy)
         l.insert(3, w(2))
         assert isinstance(l.strategy, ObjectListStrategy)
 
@@ -226,7 +225,7 @@
 
         # UnicodeStrategy to ObjectStrategy
         l = W_ListObject(space, [w(u'a'), w(u'b'), w(u'c')])
-        #assert isinstance(l.strategy, UnicodeListStrategy)
+        assert isinstance(l.strategy, UnicodeListStrategy)
         l.setslice(0, 1, 2, W_ListObject(space, [w(1), w(2), w(3)]))
         assert isinstance(l.strategy, ObjectListStrategy)
 
@@ -276,7 +275,7 @@
         l = W_ListObject(space, wrapitems([u"a",u"b",u"c",u"d",u"e"]))
         other = W_ListObject(space, wrapitems([u"a", u"b", u"c"]))
         keep_other_strategy(l, 0, 2, other.length(), other)
-        #assert l.strategy is space.fromcache(UnicodeListStrategy)
+        assert l.strategy is space.fromcache(UnicodeListStrategy)
 
         l = W_ListObject(space, wrapitems([1.1, 2.2, 3.3, 4.4, 5.5]))
         other = W_ListObject(space, [])
@@ -346,7 +345,7 @@
         empty = W_ListObject(space, [])
         assert isinstance(empty.strategy, EmptyListStrategy)
         empty.extend(W_ListObject(space, [w(u"a"), w(u"b"), w(u"c")]))
-        #assert isinstance(empty.strategy, UnicodeListStrategy)
+        assert isinstance(empty.strategy, UnicodeListStrategy)
 
         empty = W_ListObject(space, [])
         assert isinstance(empty.strategy, EmptyListStrategy)
@@ -602,7 +601,7 @@
         l1 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newbytes("zwei")])
         assert isinstance(l1.strategy, BytesListStrategy)
         l2 = W_ListObject(self.space, [self.space.newunicode(u"eins"), self.space.newunicode(u"zwei")])
-        #assert isinstance(l2.strategy, UnicodeListStrategy)
+        assert isinstance(l2.strategy, UnicodeListStrategy)
         l3 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newunicode(u"zwei")])
         assert isinstance(l3.strategy, ObjectListStrategy)
 
@@ -613,11 +612,10 @@
         assert space.listview_bytes(w_l) == ["a", "b"]
 
     def test_listview_unicode(self):
-        py.test.skip("disabled")
         space = self.space
-        assert space.listview_unicode(space.wrap(1)) == None
+        assert space.listview_utf8(space.wrap(1)) == None
         w_l = self.space.newlist([self.space.wrap(u'a'), self.space.wrap(u'b')])
-        assert space.listview_unicode(w_l) == [u"a", u"b"]
+        assert space.listview_utf8(w_l) == ["a", "b"]
 
     def test_string_join_uses_listview_bytes(self):
         space = self.space
@@ -626,7 +624,6 @@
         assert space.str_w(space.call_method(space.wrap("c"), "join", w_l)) == "acb"
         #
         # the same for unicode
-        py.test.skip("disabled")
         w_l = self.space.newlist([self.space.wrap(u'a'), self.space.wrap(u'b')])
         w_l.getitems = None
         assert space.unicode_w(space.call_method(space.wrap(u"c"), "join", w_l)) == u"acb"
@@ -639,7 +636,6 @@
         assert space.is_w(space.call_method(space.wrap(" -- "), "join", w_l), w_text)
         #
         # the same for unicode
-        py.test.skip("disabled")
         w_text = space.wrap(u"text")
         w_l = self.space.newlist([w_text])
         w_l.getitems = None
@@ -669,7 +665,6 @@
         assert space.listview_bytes(w_l4) == ["a", "b", "c"]
 
     def test_unicode_uses_newlist_unicode(self):
-        py.test.skip("disabled")
         space = self.space
         w_u = space.wrap(u"a b c")
         space.newlist = None
@@ -725,7 +720,6 @@
         assert self.space.listview_bytes(w_l) == ["a", "b"]
 
     def test_listview_unicode_list(self):
-        py.test.skip("disabled")
         space = self.space
         w_l = W_ListObject(space, [space.wrap(u"a"), space.wrap(u"b")])
         assert self.space.listview_unicode(w_l) == [u"a", u"b"]
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -116,7 +116,7 @@
 
     charbuf_w = str_w
 
-    def listview_unicode(self):
+    def listview_utf8(self):
         assert self.is_ascii()
         return _create_list_from_unicode(self._utf8)
 
@@ -502,9 +502,9 @@
 
     _StringMethods_descr_join = descr_join
     def descr_join(self, space, w_list):
-        l = space.listview_unicode(w_list)
+        l = space.listview_utf8(w_list)
         if l is not None:
-            assert False, "unreachable"
+            xxxx
             if len(l) == 1:
                 return space.newunicode(l[0])
             return space.newunicode(self._utf8).join(l)