[pypy-commit] pypy unicode-utf8-py3: fixes, start to handle some edge cases
mattip
pypy.commits at gmail.com
Mon Jul 9 00:45:49 EDT 2018
Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r94835:f287dec62c4e
Date: 2018-07-08 21:38 -0700
http://bitbucket.org/pypy/pypy/changeset/f287dec62c4e/
Log: fixes, start to handle some edge cases
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -122,7 +122,7 @@
if w_fill is None:
w_fill = space.w_None
if space.is_w(w_type, space.w_dict):
- ulist = space.listview_unicode(w_keys)
+ ulist = space.listview_utf8(w_keys)
if ulist is not None:
strategy = space.fromcache(UnicodeDictStrategy)
storage = strategy.get_storage_fromkeys(ulist, w_fill)
@@ -1183,21 +1183,21 @@
# we should implement the same shortcuts as we do for BytesDictStrategy
- ## def setitem_str(self, w_dict, key, w_value):
- ## assert key is not None
- ## self.unerase(w_dict.dstorage)[key] = w_value
+ def setitem_str(self, w_dict, key, w_value):
+ assert key is not None
+ self.unerase(w_dict.dstorage)[key] = w_value
- ## def getitem(self, w_dict, w_key):
- ## space = self.space
- ## # -- This is called extremely often. Hack for performance --
- ## if type(w_key) is space.StringObjectCls:
- ## return self.getitem_str(w_dict, w_key.unwrap(space))
- ## # -- End of performance hack --
- ## return AbstractTypedStrategy.getitem(self, w_dict, w_key)
+ def getitem(self, w_dict, w_key):
+ space = self.space
+ # -- This is called extremely often. Hack for performance --
+ if type(w_key) is space.StringObjectCls:
+ return self.getitem_str(w_dict, w_key.unwrap(space))
+ # -- End of performance hack --
+ return AbstractTypedStrategy.getitem(self, w_dict, w_key)
- ## def getitem_str(self, w_dict, key):
- ## assert key is not None
- ## return self.unerase(w_dict.dstorage).get(key, None)
+ def getitem_str(self, w_dict, key):
+ assert key is not None
+ return self.unerase(w_dict.dstorage).get(key, None)
def listview_utf8(self, w_dict):
return self.unerase(w_dict.dstorage).keys()
@@ -1208,18 +1208,26 @@
def wrapkey(space, key):
return space.newutf8(key, len(key))
- ## @jit.look_inside_iff(lambda self, w_dict:
- ## w_dict_unrolling_heuristic(w_dict))
- ## def view_as_kwargs(self, w_dict):
- ## d = self.unerase(w_dict.dstorage)
- ## l = len(d)
- ## keys, values = [None] * l, [None] * l
- ## i = 0
- ## for key, val in d.iteritems():
- ## keys[i] = key
- ## values[i] = val
- ## i += 1
- ## return keys, values
+ @jit.look_inside_iff(lambda self, w_dict:
+ w_dict_unrolling_heuristic(w_dict))
+ def view_as_kwargs(self, w_dict):
+ d = self.unerase(w_dict.dstorage)
+ l = len(d)
+ keys, values = [None] * l, [None] * l
+ i = 0
+ for key, val in d.iteritems():
+ keys[i] = key
+ values[i] = val
+ i += 1
+ return keys, values
+
+ def get_storage_fromkeys(self, keys_w, w_fill):
+ """Return an initialized storage with keys and fill values"""
+ storage = {}
+ mark_dict_non_null(storage)
+ for key in keys_w:
+ storage[key] = w_fill
+ return self.erase(storage)
create_iterator_classes(UnicodeDictStrategy)
@@ -1426,7 +1434,7 @@
typename = space.type(self).getname(space)
w_seq = space.call_function(space.w_list, self)
seq_repr = space.utf8_w(space.repr(w_seq))
- return space.newtext(b"%s(%s)" % (typename, seq_repr))
+ return space.newtext(u"%s(%s)" % (typename, seq_repr.decode('utf8')))
def descr_len(self, space):
return space.len(self.w_dict)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -327,14 +327,13 @@
return W_ListObject.newlist_bytes(self, list_s)
def newlist_text(self, list_t):
- return self.newlist_unicode([
+ return self.newlist_utf8([
str_decode_utf8(s, "string", True, None, allow_surrogates=True)[0]
for s in list_t])
- def newlist_utf8(self, list_u, is_ascii):
- if is_ascii:
- return W_ListObject.newlist_utf8(self, list_u)
- return ObjSpace.newlist_utf8(self, list_u, False)
+ def newlist_utf8(self, list_u, is_ascii=True):
+ # TODO ignoring is_ascii, is that correct?
+ return W_ListObject.newlist_utf8(self, list_u)
def newlist_int(self, list_i):
return W_ListObject.newlist_int(self, list_i)
@@ -553,8 +552,7 @@
return w_obj.listview_utf8()
if type(w_obj) is W_SetObject or type(w_obj) is W_FrozensetObject:
return w_obj.listview_utf8()
- if (isinstance(w_obj, W_UnicodeObject) and not self._uses_unicode_iter(w_obj)
- and w_obj.is_ascii()):
+ if isinstance(w_obj, W_UnicodeObject) and self._uses_unicode_iter(w_obj):
return w_obj.listview_utf8()
if isinstance(w_obj, W_ListObject) and self._uses_list_iter(w_obj):
return w_obj.getitems_utf8()
diff --git a/pypy/objspace/std/test/test_dictmultiobject.py b/pypy/objspace/std/test/test_dictmultiobject.py
--- a/pypy/objspace/std/test/test_dictmultiobject.py
+++ b/pypy/objspace/std/test/test_dictmultiobject.py
@@ -1247,6 +1247,11 @@
self.hash_count += 1
return unicode.__hash__(self)
+ def is_ascii(self):
+ return True
+
+ def unwrapped(self):
+ return True
# the minimal 'space' needed to use a W_DictMultiObject
class FakeSpace:
@@ -1285,15 +1290,17 @@
def text_w(self, u):
assert isinstance(u, unicode)
- return u.encode('utf-8')
+ return FakeUnicode(u)
def bytes_w(self, string):
assert isinstance(string, str)
return string
- def utf8_w(self, b):
+ def utf8_w(self, u):
+ if isinstance(u, unicode):
+ u = u.encode('utf8')
assert isinstance(u, str)
- return b
+ return u
def int_w(self, integer, allow_conversion=True):
assert isinstance(integer, int)
@@ -1301,12 +1308,17 @@
def wrap(self, obj):
if isinstance(obj, str):
- return obj.decode('ascii')
+ return FakeUnicode(obj.decode('ascii'))
return obj
def newtext(self, string):
- assert isinstance(string, str)
- return string.decode('utf-8')
+ if isinstance(string, str):
+ return FakeUnicode(string.decode('utf-8'))
+ assert isinstance(string, unicode)
+ return FakeUnicode(string)
+
+ def newutf8(self, obj, lgt):
+ return obj
def newbytes(self, obj):
return obj
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -77,22 +77,30 @@
assert space.int_w(w_index) == rexpected
expected = u.startswith(v, start)
+ if expected and start > len(u):
+ expected = False # python2 vs. python3
w_res = space.call_method(w_u, 'startswith', w_v,
space.newint(start))
assert w_res is space.newbool(expected)
expected = u.startswith(v, start, start + len1)
+ if expected and start > len(u):
+ expected = False # python2 vs. python3
w_res = space.call_method(w_u, 'startswith', w_v,
space.newint(start),
space.newint(start + len1))
assert w_res is space.newbool(expected)
expected = u.endswith(v, start)
+ if expected and start > len(u):
+ expected = False # python2 vs. python3
w_res = space.call_method(w_u, 'endswith', w_v,
space.newint(start))
assert w_res is space.newbool(expected)
expected = u.endswith(v, start, start + len1)
+ if expected and start > len(u):
+ expected = False # python2 vs. python3
w_res = space.call_method(w_u, 'endswith', w_v,
space.newint(start),
space.newint(start + len1))
@@ -102,6 +110,7 @@
space = self.space
w_uni = space.wrap(u'abcd')
assert space.text_w(w_uni) == 'abcd'
+ # TODO : how to handle this?
w_uni = space.wrap(unichr(0xd921) + unichr(0xdddd))
space.raises_w(space.w_UnicodeEncodeError, space.text_w, w_uni)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -35,6 +35,7 @@
@enforceargs(utf8str=str)
def __init__(self, utf8str, length):
assert isinstance(utf8str, bytes)
+ # TODO: how to handle surrogates
assert length >= 0
self._utf8 = utf8str
self._length = length
@@ -125,7 +126,8 @@
if isinstance(w_other, W_UnicodeObject):
return w_other
if space.isinstance_w(w_other, space.w_bytes):
- return unicode_from_bytes(space, w_other)
+ raise oefmt(space.w_TypeError,
+ "Can't convert '%T' object to str implicitly", w_other)
if strict:
raise oefmt(space.w_TypeError,
"%s arg must be None, unicode or str", strict)
@@ -142,8 +144,6 @@
def _multi_chr(self, unichar):
return unichar
- _builder = UnicodeBuilder
-
def _generic_name(self):
return "str"
@@ -373,14 +373,15 @@
return mod_format(space, w_values, self, fmt_type=FORMAT_UNICODE)
def descr_swapcase(self, space):
- input = self._utf8
- builder = rutf8.Utf8StringBuilder(len(input))
- for ch in rutf8.Utf8StringIterator(input):
+ value = self._utf8
+ builder = rutf8.Utf8StringBuilder(len(value))
+ for ch in rutf8.Utf8StringIterator(value):
if unicodedb.isupper(ch):
- ch = unicodedb.tolower(ch)
+ codes = unicodedb.tolower_full(ch)
elif unicodedb.islower(ch):
- ch = unicodedb.toupper(ch)
- builder.append_code(ch)
+ codes = unicodedb.toupper_full(ch)
+ for c in codes:
+ builder.append_code(c)
return self.from_utf8builder(builder)
def descr_title(self, space):
@@ -393,15 +394,51 @@
input = self._utf8
builder = rutf8.Utf8StringBuilder(len(input))
previous_is_cased = False
+ i = 0
for ch in rutf8.Utf8StringIterator(input):
- if not previous_is_cased:
- ch = unicodedb.totitle(ch)
+ if ch == 0x3a3:
+ codes = [self._handle_capital_sigma(input, i),]
+ elif not previous_is_cased:
+ codes = unicodedb.totitle_full(ch)
else:
- ch = unicodedb.tolower(ch)
- builder.append_code(ch)
- previous_is_cased = unicodedb.iscased(ch)
+ codes = unicodedb.tolower_full(ch)
+ for c in codes:
+ builder.append_code(c)
+ previous_is_cased = unicodedb.iscased(codes[-1])
+ i += 1
return self.from_utf8builder(builder)
+ def _handle_capital_sigma(self, value, i):
+ # U+03A3 is in the Final_Sigma context when, it is found like this:
+ #\p{cased} \p{case-ignorable}* U+03A3 not(\p{case-ignorable}* \p{cased})
+ # where \p{xxx} is a character with property xxx.
+
+ # TODO: find a better way for utf8 -> codepoints
+ value = [ch for ch in rutf8.Utf8StringIterator(value)]
+ j = i - 1
+ final_sigma = False
+ while j >= 0:
+ ch = value[j]
+ if unicodedb.iscaseignorable(ch):
+ j -= 1
+ continue
+ final_sigma = unicodedb.iscased(ch)
+ break
+ if final_sigma:
+ j = i + 1
+ length = len(value)
+ while j < length:
+ ch = value[j]
+ if unicodedb.iscaseignorable(ch):
+ j += 1
+ continue
+ final_sigma = not unicodedb.iscased(ch)
+ break
+ if final_sigma:
+ return 0x3C2
+ else:
+ return 0x3C3
+
def descr_translate(self, space, w_table):
builder = rutf8.Utf8StringBuilder(len(self._utf8))
for codepoint in rutf8.Utf8StringIterator(self._utf8):
@@ -519,23 +556,29 @@
return space.is_w(space.type(w_obj), space.w_unicode)
def descr_casefold(self, space):
- value = self._val(space)
- builder = self._builder(len(value))
- for c in value:
- c_ord = ord(c)
- folded = unicodedb.casefold_lookup(c_ord)
+ value = self._utf8
+ builder = rutf8.Utf8StringBuilder(len(value))
+ for ch in rutf8.Utf8StringIterator(value):
+ folded = unicodedb.casefold_lookup(ch)
if folded is None:
- builder.append(unichr(unicodedb.tolower(c_ord)))
+ builder.append_code(unicodedb.tolower(ch))
else:
for r in folded:
- builder.append(unichr(r))
- return self._new(builder.build())
+ builder.append_code(r)
+ return self.from_utf8builder(builder)
def descr_lower(self, space):
- builder = rutf8.Utf8StringBuilder(len(self._utf8))
- for ch in rutf8.Utf8StringIterator(self._utf8):
- lower = unicodedb.tolower(ch)
- builder.append_code(lower)
+ value = self._utf8
+ builder = rutf8.Utf8StringBuilder(len(value))
+ i = 0
+ for ch in rutf8.Utf8StringIterator(value):
+ if ch == 0x3a3:
+ codes = [self._handle_capital_sigma(value, i),]
+ else:
+ codes = unicodedb.tolower_full(ch)
+ for c in codes:
+ builder.append_code(c)
+ i += 1
return self.from_utf8builder(builder)
def descr_isdecimal(self, space):
@@ -589,11 +632,18 @@
value = self._utf8
if space.isinstance_w(w_prefix, space.w_tuple):
return self._startswith_tuple(space, value, w_prefix, start, end)
- return space.newbool(self._startswith(space, value, w_prefix, start,
+ try:
+ return space.newbool(self._startswith(space, value, w_prefix, start,
end))
+ except OperationError as e:
+ if e.match(space, space.w_TypeError):
+ raise oefmt(space.w_TypeError, 'startswith first arg must be str '
+ 'or a tuple of str, not %T', w_prefix)
def _startswith(self, space, value, w_prefix, start, end):
prefix = self.convert_arg_to_w_unicode(space, w_prefix)._utf8
+ if start > len(value):
+ return False
if len(prefix) == 0:
return True
return startswith(value, prefix, start, end)
@@ -603,11 +653,18 @@
value = self._utf8
if space.isinstance_w(w_suffix, space.w_tuple):
return self._endswith_tuple(space, value, w_suffix, start, end)
- return space.newbool(self._endswith(space, value, w_suffix, start,
+ try:
+ return space.newbool(self._endswith(space, value, w_suffix, start,
end))
+ except OperationError as e:
+ if e.match(space, space.w_TypeError):
+ raise oefmt(space.w_TypeError, 'endswith first arg must be str '
+ 'or a tuple of str, not %T', w_suffix)
def _endswith(self, space, value, w_prefix, start, end):
prefix = self.convert_arg_to_w_unicode(space, w_prefix)._utf8
+ if start > len(value):
+ return False
if len(prefix) == 0:
return True
return endswith(value, prefix, start, end)
@@ -684,8 +741,9 @@
def descr_upper(self, space):
builder = rutf8.Utf8StringBuilder(len(self._utf8))
for ch in rutf8.Utf8StringIterator(self._utf8):
- ch = unicodedb.toupper(ch)
- builder.append_code(ch)
+ codes = unicodedb.toupper_full(ch)
+ for c in codes:
+ builder.append_code(c)
return self.from_utf8builder(builder)
@unwrap_spec(width=int)
@@ -792,14 +850,16 @@
builder = rutf8.Utf8StringBuilder(len(self._utf8))
it = rutf8.Utf8StringIterator(self._utf8)
uchar = it.next()
- ch = unicodedb.toupper(uchar)
- builder.append_code(ch)
+ codes = unicodedb.toupper_full(uchar)
+ # can sometimes give more than one, like for omega-with-Ypogegrammeni, 8179
+ for c in codes:
+ builder.append_code(c)
for ch in it:
ch = unicodedb.tolower(ch)
builder.append_code(ch)
return self.from_utf8builder(builder)
- @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
+ @unwrap_spec(width=int, w_fillchar=WrappedDefault(u' '))
def descr_center(self, space, width, w_fillchar):
value = self._utf8
fillchar = self.convert_arg_to_w_unicode(space, w_fillchar)._utf8
@@ -978,14 +1038,14 @@
end_index = len(self._utf8)
if start > 0:
if start > self._length:
- start_index = end_index
+ start_index = end_index + 1
else:
start_index = self._index_to_byte(start)
if end < self._length:
end_index = self._index_to_byte(end)
return (start_index, end_index)
- @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
+ @unwrap_spec(width=int, w_fillchar=WrappedDefault(u' '))
def descr_rjust(self, space, width, w_fillchar):
value = self._utf8
lgt = self._len()
@@ -1004,7 +1064,7 @@
return W_UnicodeObject(value, lgt)
- @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
+ @unwrap_spec(width=int, w_fillchar=WrappedDefault(u' '))
def descr_ljust(self, space, width, w_fillchar):
value = self._utf8
w_fillchar = self.convert_arg_to_w_unicode(space, w_fillchar)
@@ -1080,23 +1140,11 @@
def descr_isprintable(self, space):
- for uchar in self._value:
- if not unicodedb.isprintable(ord(uchar)):
+ for ch in rutf8.Utf8StringIterator(self._utf8):
+ if not unicodedb.isprintable(ch):
return space.w_False
return space.w_True
- def _fix_fillchar(func):
- # XXX: hack
- from rpython.tool.sourcetools import func_with_new_name
- func = func_with_new_name(func, func.__name__)
- func.unwrap_spec = func.unwrap_spec.copy()
- func.unwrap_spec['w_fillchar'] = WrappedDefault(u' ')
- return func
-
- descr_center = _fix_fillchar(StringMethods.descr_center)
- descr_ljust = _fix_fillchar(StringMethods.descr_ljust)
- descr_rjust = _fix_fillchar(StringMethods.descr_rjust)
-
@staticmethod
def _iter_getitem_result(self, space, index):
assert isinstance(self, W_UnicodeObject)
@@ -1172,7 +1220,7 @@
def decode_object(space, w_obj, encoding, errors):
if encoding is None:
encoding = getdefaultencoding(space)
- if errors is None or errors == 'strict':
+ if errors is None or errors == 'strict' or errors == 'surrogateescape':
if encoding == 'ascii':
s = space.charbuf_w(w_obj)
unicodehelper.check_ascii_or_raise(space, s)
@@ -1824,7 +1872,7 @@
def unicode_to_decimal_w(space, w_unistr, allow_surrogates=False):
if not isinstance(w_unistr, W_UnicodeObject):
raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
- value = _rpy_unicode_to_decimal_w(space, w_unistr.utf8_w(space))
+ value = _rpy_unicode_to_decimal_w(space, w_unistr.utf8_w(space).decode('utf8'))
return unicodehelper.encode_utf8(space, value,
allow_surrogates=allow_surrogates)
More information about the pypy-commit mailing list