[pypy-commit] pypy unicode-utf8: fight until the strategies seem to work again for ascii unicode strings at least
fijal
pypy.commits at gmail.com
Sat Nov 4 18:17:05 EDT 2017
Author: fijal
Branch: unicode-utf8
Changeset: r92940:1645f5285398
Date: 2017-11-04 20:32 +0100
http://bitbucket.org/pypy/pypy/changeset/1645f5285398/
Log: fight until the strategies seem to work again for ascii unicode
strings at least
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -3,7 +3,7 @@
from rpython.rlib.cache import Cache
from rpython.tool.uid import HUGEVAL_BYTES
-from rpython.rlib import jit, types
+from rpython.rlib import jit, types, rutf8
from rpython.rlib.debug import make_sure_not_resized
from rpython.rlib.objectmodel import (we_are_translated, newlist_hint,
compute_unique_id, specialize, not_rpython)
@@ -1084,8 +1084,12 @@
def newlist_bytes(self, list_s):
return self.newlist([self.newbytes(s) for s in list_s])
- def newlist_unicode(self, list_u):
- return self.newlist([self.newunicode(u) for u in list_u])
+ def newlist_utf8(self, list_u, is_ascii):
+ l_w = [None] * len(list_u)
+ for i, item in enumerate(list_u):
+ length, flag = rutf8.check_utf8(item, True)
+ l_w[i] = self.newutf8(item, length, flag)
+ return self.newlist(l_w)
def newlist_int(self, list_i):
return self.newlist([self.newint(i) for i in list_i])
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -1,6 +1,6 @@
"""The builtin dict implementation"""
-from rpython.rlib import jit, rerased, objectmodel
+from rpython.rlib import jit, rerased, objectmodel, rutf8
from rpython.rlib.debug import mark_dict_non_null
from rpython.rlib.objectmodel import newlist_hint, r_dict, specialize
from rpython.tool.sourcetools import func_renamer, func_with_new_name
@@ -441,7 +441,7 @@
popitem delitem clear \
length w_keys values items \
iterkeys itervalues iteritems \
- listview_bytes listview_unicode listview_int \
+ listview_bytes listview_utf8 listview_int \
view_as_kwargs".split()
def make_method(method):
@@ -593,7 +593,7 @@
def listview_bytes(self, w_dict):
return None
- def listview_unicode(self, w_dict):
+ def listview_utf8(self, w_dict):
return None
def listview_int(self, w_dict):
@@ -640,7 +640,7 @@
if type(w_key) is self.space.StringObjectCls:
self.switch_to_bytes_strategy(w_dict)
return
- elif type(w_key) is self.space.UnicodeObjectCls:
+ elif type(w_key) is self.space.UnicodeObjectCls and w_key.is_ascii():
self.switch_to_unicode_strategy(w_dict)
return
w_type = self.space.type(w_key)
@@ -1197,14 +1197,14 @@
unerase = staticmethod(unerase)
def wrap(self, unwrapped):
- return self.space.newunicode(unwrapped)
+ return self.space.newutf8(unwrapped, len(unwrapped), rutf8.FLAG_ASCII)
def unwrap(self, wrapped):
- return self.space.unicode_w(wrapped)
+ return self.space.utf8_w(wrapped)
def is_correct_type(self, w_obj):
space = self.space
- return space.is_w(space.type(w_obj), space.w_unicode)
+ return type(w_obj) is space.UnicodeObjectCls and w_obj.is_ascii()
def get_empty_storage(self):
res = {}
@@ -1232,14 +1232,14 @@
## assert key is not None
## return self.unerase(w_dict.dstorage).get(key, None)
- def listview_unicode(self, w_dict):
+ def listview_utf8(self, w_dict):
return self.unerase(w_dict.dstorage).keys()
## def w_keys(self, w_dict):
## return self.space.newlist_bytes(self.listview_bytes(w_dict))
def wrapkey(space, key):
- return space.newunicode(key)
+ return space.newutf8(key, len(key), rutf8.FLAG_ASCII)
## @jit.look_inside_iff(lambda self, w_dict:
## w_dict_unrolling_heuristic(w_dict))
diff --git a/pypy/objspace/std/listobject.py b/pypy/objspace/std/listobject.py
--- a/pypy/objspace/std/listobject.py
+++ b/pypy/objspace/std/listobject.py
@@ -196,7 +196,7 @@
return W_ListObject.from_storage_and_strategy(space, storage, strategy)
@staticmethod
- def newlist_unicode(space, list_u):
+ def newlist_utf8(space, list_u):
strategy = space.fromcache(UnicodeListStrategy)
storage = strategy.erase(list_u)
return W_ListObject.from_storage_and_strategy(space, storage, strategy)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -309,19 +309,10 @@
newlist_text = newlist_bytes
- def newlist_unicode(self, list_u):
- xxx
- return self.newlist(list_u)
- return W_ListObject.newlist_unicode(self, list_u)
-
- def newlist_utf8(self, lst):
- res_w = []
- for utf in lst:
- assert utf is not None
- assert isinstance(utf, str)
- length, flag = rutf8.check_utf8(utf, allow_surrogates=True)
- res_w.append(self.newutf8(utf, length, flag))
- return self.newlist(res_w)
+ def newlist_utf8(self, list_u, is_ascii):
+ if is_ascii:
+ return W_ListObject.newlist_utf8(self, list_u)
+ return ObjSpace.newlist_utf8(self, list_u, False)
def newlist_int(self, list_i):
return W_ListObject.newlist_int(self, list_i)
@@ -515,9 +506,9 @@
if type(w_obj) is W_ListObject:
return w_obj.getitems_utf8()
if type(w_obj) is W_DictObject:
- return w_obj.listview_unicode()
+ return w_obj.listview_utf8()
if type(w_obj) is W_SetObject or type(w_obj) is W_FrozensetObject:
- return w_obj.listview_unicode()
+ return w_obj.listview_utf8()
if (isinstance(w_obj, W_UnicodeObject) and self._uni_uses_no_iter(w_obj)
and w_obj.is_ascii()):
return w_obj.listview_utf8()
diff --git a/pypy/objspace/std/setobject.py b/pypy/objspace/std/setobject.py
--- a/pypy/objspace/std/setobject.py
+++ b/pypy/objspace/std/setobject.py
@@ -12,7 +12,7 @@
from rpython.rlib.objectmodel import iterkeys_with_hash, contains_with_hash
from rpython.rlib.objectmodel import setitem_with_hash, delitem_with_hash
from rpython.rlib.rarithmetic import intmask, r_uint
-from rpython.rlib import rerased, jit
+from rpython.rlib import rerased, jit, rutf8
UNROLL_CUTOFF = 5
@@ -86,9 +86,9 @@
""" If this is a string set return its contents as a list of uwnrapped strings. Otherwise return None. """
return self.strategy.listview_bytes(self)
- def listview_unicode(self):
+ def listview_utf8(self):
""" If this is a unicode set return its contents as a list of uwnrapped unicodes. Otherwise return None. """
- return self.strategy.listview_unicode(self)
+ return self.strategy.listview_utf8(self)
def listview_int(self):
""" If this is an int set return its contents as a list of uwnrapped ints. Otherwise return None. """
@@ -690,7 +690,7 @@
def listview_bytes(self, w_set):
return None
- def listview_unicode(self, w_set):
+ def listview_utf8(self, w_set):
return None
def listview_int(self, w_set):
@@ -795,8 +795,8 @@
strategy = self.space.fromcache(IntegerSetStrategy)
elif type(w_key) is W_BytesObject:
strategy = self.space.fromcache(BytesSetStrategy)
- #elif type(w_key) is W_UnicodeObject:
- # strategy = self.space.fromcache(UnicodeSetStrategy)
+ elif type(w_key) is W_UnicodeObject and w_key.is_ascii():
+ strategy = self.space.fromcache(UnicodeSetStrategy)
elif self.space.type(w_key).compares_by_identity():
strategy = self.space.fromcache(IdentitySetStrategy)
else:
@@ -1272,11 +1272,11 @@
def get_empty_dict(self):
return {}
- def listview_unicode(self, w_set):
+ def listview_utf8(self, w_set):
return self.unerase(w_set.sstorage).keys()
def is_correct_type(self, w_key):
- return type(w_key) is W_UnicodeObject
+ return type(w_key) is W_UnicodeObject and w_key.is_ascii()
def may_contain_equal_elements(self, strategy):
if strategy is self.space.fromcache(IntegerSetStrategy):
@@ -1495,7 +1495,7 @@
def next_entry(self):
for key in self.iterator:
- return self.space.newunicode(key)
+ return self.space.newutf8(key, len(key), rutf8.FLAG_ASCII)
else:
return None
@@ -1636,13 +1636,13 @@
return
# check for unicode
- #for w_item in iterable_w:
- # if type(w_item) is not W_UnicodeObject:
- # break
- #else:
- # w_set.strategy = space.fromcache(UnicodeSetStrategy)
- # w_set.sstorage = w_set.strategy.get_storage_from_list(iterable_w)
- # return
+ for w_item in iterable_w:
+ if type(w_item) is not W_UnicodeObject or not w_item.is_ascii():
+ break
+ else:
+ w_set.strategy = space.fromcache(UnicodeSetStrategy)
+ w_set.sstorage = w_set.strategy.get_storage_from_list(iterable_w)
+ return
# check for compares by identity
for w_item in iterable_w:
diff --git a/pypy/objspace/std/test/test_dictmultiobject.py b/pypy/objspace/std/test/test_dictmultiobject.py
--- a/pypy/objspace/std/test/test_dictmultiobject.py
+++ b/pypy/objspace/std/test/test_dictmultiobject.py
@@ -142,11 +142,10 @@
assert self.space.listview_bytes(w_d) == ["a", "b"]
def test_listview_unicode_dict(self):
- py.test.skip("listview_unicode disabled")
w = self.space.wrap
w_d = self.space.newdict()
w_d.initialize_content([(w(u"a"), w(1)), (w(u"b"), w(2))])
- assert self.space.listview_unicode(w_d) == [u"a", u"b"]
+ assert self.space.listview_utf8(w_d) == ["a", "b"]
def test_listview_int_dict(self):
w = self.space.wrap
diff --git a/pypy/objspace/std/test/test_liststrategies.py b/pypy/objspace/std/test/test_liststrategies.py
--- a/pypy/objspace/std/test/test_liststrategies.py
+++ b/pypy/objspace/std/test/test_liststrategies.py
@@ -675,10 +675,10 @@
w_l4 = space.call_method(w_u, "rsplit", space.wrap(" "))
finally:
del space.newlist
- assert space.listview_unicode(w_l) == [u"a", u"b", u"c"]
- assert space.listview_unicode(w_l2) == [u"a", u"b", u"c"]
- assert space.listview_unicode(w_l3) == [u"a", u"b", u"c"]
- assert space.listview_unicode(w_l4) == [u"a", u"b", u"c"]
+ assert space.listview_utf8(w_l) == [u"a", u"b", u"c"]
+ assert space.listview_utf8(w_l2) == [u"a", u"b", u"c"]
+ assert space.listview_utf8(w_l3) == [u"a", u"b", u"c"]
+ assert space.listview_utf8(w_l4) == [u"a", u"b", u"c"]
def test_pop_without_argument_is_fast(self):
space = self.space
@@ -722,7 +722,7 @@
def test_listview_unicode_list(self):
space = self.space
w_l = W_ListObject(space, [space.wrap(u"a"), space.wrap(u"b")])
- assert self.space.listview_unicode(w_l) == [u"a", u"b"]
+ assert self.space.listview_utf8(w_l) == [u"a", u"b"]
def test_listview_int_list(self):
space = self.space
diff --git a/pypy/objspace/std/test/test_setstrategies.py b/pypy/objspace/std/test/test_setstrategies.py
--- a/pypy/objspace/std/test/test_setstrategies.py
+++ b/pypy/objspace/std/test/test_setstrategies.py
@@ -42,7 +42,6 @@
assert s1.strategy is self.space.fromcache(ObjectSetStrategy)
def test_switch_to_unicode(self):
- py.test.skip("disabled")
s = W_SetObject(self.space, self.wrapped([]))
s.add(self.space.wrap(u"six"))
assert s.strategy is self.space.fromcache(UnicodeSetStrategy)
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -28,7 +28,7 @@
def test_listview_unicode(self):
w_str = self.space.newutf8('abcd', 4, rutf8.FLAG_ASCII)
- assert self.space.listview_unicode(w_str) == list(u"abcd")
+ assert self.space.listview_utf8(w_str) == list("abcd")
def test_new_shortcut(self):
space = self.space
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -196,10 +196,6 @@
def _islinebreak(self, s, pos):
return rutf8.islinebreak(s, pos)
- def _newlist_unwrapped(self, space, lst):
- assert False, "should not be called"
- return space.newlist_unicode(lst)
-
@staticmethod
@unwrap_spec(w_string=WrappedDefault(""))
def descr_new(space, w_unicodetype, w_string, w_encoding=None,
@@ -503,11 +499,11 @@
_StringMethods_descr_join = descr_join
def descr_join(self, space, w_list):
l = space.listview_utf8(w_list)
- if l is not None:
- xxxx
+ if l is not None and self.is_ascii():
if len(l) == 1:
- return space.newunicode(l[0])
- return space.newunicode(self._utf8).join(l)
+ return space.newutf8(l[0], len(l[0]), rutf8.FLAG_ASCII)
+ s = self._utf8.join(l)
+ return space.newutf8(s, len(s), rutf8.FLAG_ASCII)
return self._StringMethods_descr_join(space, w_list)
def _join_return_one(self, space, w_obj):
@@ -755,14 +751,14 @@
value = self._utf8
if space.is_none(w_sep):
res = split(value, maxsplit=maxsplit, isutf8=True)
- return space.newlist_utf8(res)
+ return space.newlist_utf8(res, self.is_ascii())
by = self.convert_arg_to_w_unicode(space, w_sep)._utf8
if len(by) == 0:
raise oefmt(space.w_ValueError, "empty separator")
res = split(value, by, maxsplit, isutf8=True)
- return space.newlist_utf8(res)
+ return space.newlist_utf8(res, self.is_ascii())
@unwrap_spec(maxsplit=int)
def descr_rsplit(self, space, w_sep=None, maxsplit=-1):
@@ -770,14 +766,14 @@
value = self._utf8
if space.is_none(w_sep):
res = rsplit(value, maxsplit=maxsplit, isutf8=True)
- return space.newlist_utf8(res)
+ return space.newlist_utf8(res, self.is_ascii())
by = self.convert_arg_to_w_unicode(space, w_sep)._utf8
if len(by) == 0:
raise oefmt(space.w_ValueError, "empty separator")
res = rsplit(value, by, maxsplit, isutf8=True)
- return space.newlist_utf8(res)
+ return space.newlist_utf8(res, self.is_ascii())
def descr_getitem(self, space, w_index):
if isinstance(w_index, W_SliceObject):
More information about the pypy-commit mailing list