[pypy-commit] pypy unicode-utf8-py3: fixes, start to handle some edge cases

Mon Jul 9 00:45:49 EDT 2018

Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r94835:f287dec62c4e
Date: 2018-07-08 21:38 -0700
http://bitbucket.org/pypy/pypy/changeset/f287dec62c4e/

Log:	fixes, start to handle some edge cases

diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -122,7 +122,7 @@
         if w_fill is None:
             w_fill = space.w_None
         if space.is_w(w_type, space.w_dict):
-            ulist = space.listview_unicode(w_keys)
+            ulist = space.listview_utf8(w_keys)
             if ulist is not None:
                 strategy = space.fromcache(UnicodeDictStrategy)
                 storage = strategy.get_storage_fromkeys(ulist, w_fill)
@@ -1183,21 +1183,21 @@
 
     # we should implement the same shortcuts as we do for BytesDictStrategy
 
-    ## def setitem_str(self, w_dict, key, w_value):
-    ##     assert key is not None
-    ##     self.unerase(w_dict.dstorage)[key] = w_value
+    def setitem_str(self, w_dict, key, w_value):
+        assert key is not None
+        self.unerase(w_dict.dstorage)[key] = w_value
 
-    ## def getitem(self, w_dict, w_key):
-    ##     space = self.space
-    ##     # -- This is called extremely often.  Hack for performance --
-    ##     if type(w_key) is space.StringObjectCls:
-    ##         return self.getitem_str(w_dict, w_key.unwrap(space))
-    ##     # -- End of performance hack --
-    ##     return AbstractTypedStrategy.getitem(self, w_dict, w_key)
+    def getitem(self, w_dict, w_key):
+        space = self.space
+        # -- This is called extremely often.  Hack for performance --
+        if type(w_key) is space.StringObjectCls:
+             return self.getitem_str(w_dict, w_key.unwrap(space))
+        # -- End of performance hack --
+        return AbstractTypedStrategy.getitem(self, w_dict, w_key)
 
-    ## def getitem_str(self, w_dict, key):
-    ##     assert key is not None
-    ##     return self.unerase(w_dict.dstorage).get(key, None)
+    def getitem_str(self, w_dict, key):
+        assert key is not None
+        return self.unerase(w_dict.dstorage).get(key, None)
 
     def listview_utf8(self, w_dict):
         return self.unerase(w_dict.dstorage).keys()
@@ -1208,18 +1208,26 @@
     def wrapkey(space, key):
         return space.newutf8(key, len(key))
 
-    ## @jit.look_inside_iff(lambda self, w_dict:
-    ##                      w_dict_unrolling_heuristic(w_dict))
-    ## def view_as_kwargs(self, w_dict):
-    ##     d = self.unerase(w_dict.dstorage)
-    ##     l = len(d)
-    ##     keys, values = [None] * l, [None] * l
-    ##     i = 0
-    ##     for key, val in d.iteritems():
-    ##         keys[i] = key
-    ##         values[i] = val
-    ##         i += 1
-    ##     return keys, values
+    @jit.look_inside_iff(lambda self, w_dict:
+                         w_dict_unrolling_heuristic(w_dict))
+    def view_as_kwargs(self, w_dict):
+        d = self.unerase(w_dict.dstorage)
+        l = len(d)
+        keys, values = [None] * l, [None] * l
+        i = 0
+        for key, val in d.iteritems():
+            keys[i] = key
+            values[i] = val
+            i += 1
+        return keys, values
+
+    def get_storage_fromkeys(self, keys_w, w_fill):
+        """Return an initialized storage with keys and fill values"""
+        storage = {}
+        mark_dict_non_null(storage)
+        for key in keys_w:
+            storage[key] = w_fill
+        return self.erase(storage)
 
 create_iterator_classes(UnicodeDictStrategy)
 
@@ -1426,7 +1434,7 @@
         typename = space.type(self).getname(space)
         w_seq = space.call_function(space.w_list, self)
         seq_repr = space.utf8_w(space.repr(w_seq))
-        return space.newtext(b"%s(%s)" % (typename, seq_repr))
+        return space.newtext(u"%s(%s)" % (typename, seq_repr.decode('utf8')))
 
     def descr_len(self, space):
         return space.len(self.w_dict)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -327,14 +327,13 @@
         return W_ListObject.newlist_bytes(self, list_s)
 
     def newlist_text(self, list_t):
-        return self.newlist_unicode([
+        return self.newlist_utf8([
             str_decode_utf8(s, "string", True, None, allow_surrogates=True)[0]
                      for s in list_t])
 
-    def newlist_utf8(self, list_u, is_ascii):
-        if is_ascii:
-            return W_ListObject.newlist_utf8(self, list_u)
-        return ObjSpace.newlist_utf8(self, list_u, False)
+    def newlist_utf8(self, list_u, is_ascii=True):
+        # TODO: is_ascii is ignored here; is that correct?
+        return W_ListObject.newlist_utf8(self, list_u)
 
     def newlist_int(self, list_i):
         return W_ListObject.newlist_int(self, list_i)
@@ -553,8 +552,7 @@
             return w_obj.listview_utf8()
         if type(w_obj) is W_SetObject or type(w_obj) is W_FrozensetObject:
             return w_obj.listview_utf8()
-        if (isinstance(w_obj, W_UnicodeObject) and not self._uses_unicode_iter(w_obj)
-            and w_obj.is_ascii()):
+        if isinstance(w_obj, W_UnicodeObject) and self._uses_unicode_iter(w_obj):
             return w_obj.listview_utf8()
         if isinstance(w_obj, W_ListObject) and self._uses_list_iter(w_obj):
             return w_obj.getitems_utf8()
diff --git a/pypy/objspace/std/test/test_dictmultiobject.py b/pypy/objspace/std/test/test_dictmultiobject.py
--- a/pypy/objspace/std/test/test_dictmultiobject.py
+++ b/pypy/objspace/std/test/test_dictmultiobject.py
@@ -1247,6 +1247,11 @@
         self.hash_count += 1
         return unicode.__hash__(self)
 
+    def is_ascii(self):
+        return True
+
+    def unwrapped(self):
+        return True
 
 # the minimal 'space' needed to use a W_DictMultiObject
 class FakeSpace:
@@ -1285,15 +1290,17 @@
 
     def text_w(self, u):
         assert isinstance(u, unicode)
-        return u.encode('utf-8')
+        return FakeUnicode(u)
 
     def bytes_w(self, string):
         assert isinstance(string, str)
         return string
 
-    def utf8_w(self, b):
+    def utf8_w(self, u):
+        if isinstance(u, unicode):
+            u = u.encode('utf8')
         assert isinstance(u, str)
-        return b
+        return u
 
     def int_w(self, integer, allow_conversion=True):
         assert isinstance(integer, int)
@@ -1301,12 +1308,17 @@
 
     def wrap(self, obj):
         if isinstance(obj, str):
-            return obj.decode('ascii')
+            return FakeUnicode(obj.decode('ascii'))
         return obj
 
     def newtext(self, string):
-        assert isinstance(string, str)
-        return string.decode('utf-8')
+        if isinstance(string, str):
+            return FakeUnicode(string.decode('utf-8'))
+        assert isinstance(string, unicode)
+        return FakeUnicode(string)
+
+    def newutf8(self, obj, lgt):
+        return obj
 
     def newbytes(self, obj):
         return obj
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -77,22 +77,30 @@
         assert space.int_w(w_index) == rexpected
 
         expected = u.startswith(v, start)
+        if expected and start > len(u):
+            expected = False # python2 vs. python3
         w_res = space.call_method(w_u, 'startswith', w_v,
                                   space.newint(start))
         assert w_res is space.newbool(expected)
 
         expected = u.startswith(v, start, start + len1)
+        if expected and start > len(u):
+            expected = False # python2 vs. python3
         w_res = space.call_method(w_u, 'startswith', w_v,
                                   space.newint(start),
                                   space.newint(start + len1))
         assert w_res is space.newbool(expected)
 
         expected = u.endswith(v, start)
+        if expected and start > len(u):
+            expected = False # python2 vs. python3
         w_res = space.call_method(w_u, 'endswith', w_v,
                                   space.newint(start))
         assert w_res is space.newbool(expected)
 
         expected = u.endswith(v, start, start + len1)
+        if expected and start > len(u):
+            expected = False # python2 vs. python3
         w_res = space.call_method(w_u, 'endswith', w_v,
                                   space.newint(start),
                                   space.newint(start + len1))
@@ -102,6 +110,7 @@
         space = self.space
         w_uni = space.wrap(u'abcd')
         assert space.text_w(w_uni) == 'abcd'
+        # TODO: how to handle surrogate code points here?
         w_uni = space.wrap(unichr(0xd921) + unichr(0xdddd))
         space.raises_w(space.w_UnicodeEncodeError, space.text_w, w_uni)
 
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -35,6 +35,7 @@
     @enforceargs(utf8str=str)
     def __init__(self, utf8str, length):
         assert isinstance(utf8str, bytes)
+        # TODO: how to handle surrogates
         assert length >= 0
         self._utf8 = utf8str
         self._length = length
@@ -125,7 +126,8 @@
         if isinstance(w_other, W_UnicodeObject):
             return w_other
         if space.isinstance_w(w_other, space.w_bytes):
-            return unicode_from_bytes(space, w_other)
+            raise oefmt(space.w_TypeError,
+                    "Can't convert '%T' object to str implicitly", w_other)
         if strict:
             raise oefmt(space.w_TypeError,
                 "%s arg must be None, unicode or str", strict)
@@ -142,8 +144,6 @@
     def _multi_chr(self, unichar):
         return unichar
 
-    _builder = UnicodeBuilder
-
     def _generic_name(self):
         return "str"
 
@@ -373,14 +373,15 @@
         return mod_format(space, w_values, self, fmt_type=FORMAT_UNICODE)
 
     def descr_swapcase(self, space):
-        input = self._utf8
-        builder = rutf8.Utf8StringBuilder(len(input))
-        for ch in rutf8.Utf8StringIterator(input):
+        value = self._utf8
+        builder = rutf8.Utf8StringBuilder(len(value))
+        for ch in rutf8.Utf8StringIterator(value):
             if unicodedb.isupper(ch):
-                ch = unicodedb.tolower(ch)
+                codes = unicodedb.tolower_full(ch)
             elif unicodedb.islower(ch):
-                ch = unicodedb.toupper(ch)
-            builder.append_code(ch)
+                codes = unicodedb.toupper_full(ch)
+            for c in codes:
+                builder.append_code(c)
         return self.from_utf8builder(builder)
 
     def descr_title(self, space):
@@ -393,15 +394,51 @@
         input = self._utf8
         builder = rutf8.Utf8StringBuilder(len(input))
         previous_is_cased = False
+        i = 0
         for ch in rutf8.Utf8StringIterator(input):
-            if not previous_is_cased:
-                ch = unicodedb.totitle(ch)
+            if ch == 0x3a3:
+                codes = [self._handle_capital_sigma(input, i),]
+            elif not previous_is_cased:
+                codes = unicodedb.totitle_full(ch)
             else:
-                ch = unicodedb.tolower(ch)
-            builder.append_code(ch)
-            previous_is_cased = unicodedb.iscased(ch)
+                codes = unicodedb.tolower_full(ch)
+            for c in codes:
+                builder.append_code(c)
+            previous_is_cased = unicodedb.iscased(codes[-1])
+            i += 1
         return self.from_utf8builder(builder)
 
+    def _handle_capital_sigma(self, value, i):
+        # U+03A3 is in the Final_Sigma context when it is found like this:
+        # \p{cased} \p{case-ignorable}* U+03A3 not(\p{case-ignorable}* \p{cased})
+        # where \p{xxx} is a character with property xxx.
+
+        # TODO: find a better way for utf8 -> codepoints
+        value = [ch for ch in rutf8.Utf8StringIterator(value)]
+        j = i - 1
+        final_sigma = False
+        while j >= 0:
+            ch = value[j]
+            if unicodedb.iscaseignorable(ch):
+                j -= 1
+                continue
+            final_sigma = unicodedb.iscased(ch)
+            break
+        if final_sigma:
+            j = i + 1
+            length = len(value)
+            while j < length:
+                ch = value[j]
+                if unicodedb.iscaseignorable(ch):
+                    j += 1
+                    continue
+                final_sigma = not unicodedb.iscased(ch)
+                break
+        if final_sigma:
+            return 0x3C2
+        else:
+            return 0x3C3
+
     def descr_translate(self, space, w_table):
         builder = rutf8.Utf8StringBuilder(len(self._utf8))
         for codepoint in rutf8.Utf8StringIterator(self._utf8):
@@ -519,23 +556,29 @@
         return space.is_w(space.type(w_obj), space.w_unicode)
 
     def descr_casefold(self, space):
-        value = self._val(space)
-        builder = self._builder(len(value))
-        for c in value:
-            c_ord = ord(c)
-            folded = unicodedb.casefold_lookup(c_ord)
+        value = self._utf8
+        builder = rutf8.Utf8StringBuilder(len(value))
+        for ch in rutf8.Utf8StringIterator(value):
+            folded = unicodedb.casefold_lookup(ch)
             if folded is None:
-                builder.append(unichr(unicodedb.tolower(c_ord)))
+                builder.append_code(unicodedb.tolower(ch))
             else:
                 for r in folded:
-                    builder.append(unichr(r))
-        return self._new(builder.build())
+                    builder.append_code(r)
+        return self.from_utf8builder(builder)
 
     def descr_lower(self, space):
-        builder = rutf8.Utf8StringBuilder(len(self._utf8))
-        for ch in rutf8.Utf8StringIterator(self._utf8):
-            lower = unicodedb.tolower(ch)
-            builder.append_code(lower)
+        value = self._utf8
+        builder = rutf8.Utf8StringBuilder(len(value))
+        i = 0
+        for ch in rutf8.Utf8StringIterator(value):
+            if ch == 0x3a3:
+                codes = [self._handle_capital_sigma(value, i),]
+            else:
+                codes = unicodedb.tolower_full(ch)
+            for c in codes:
+                builder.append_code(c)
+            i += 1
         return self.from_utf8builder(builder)
 
     def descr_isdecimal(self, space):
@@ -589,11 +632,18 @@
         value = self._utf8
         if space.isinstance_w(w_prefix, space.w_tuple):
             return self._startswith_tuple(space, value, w_prefix, start, end)
-        return space.newbool(self._startswith(space, value, w_prefix, start,
+        try:
+            return space.newbool(self._startswith(space, value, w_prefix, start,
                                               end))
+        except OperationError as e:
+            if e.match(space, space.w_TypeError):
+                raise oefmt(space.w_TypeError, 'startswith first arg must be str '
+                        'or a tuple of str, not %T', w_prefix)
 
     def _startswith(self, space, value, w_prefix, start, end):
         prefix = self.convert_arg_to_w_unicode(space, w_prefix)._utf8
+        if start > len(value):
+            return False
         if len(prefix) == 0:
             return True
         return startswith(value, prefix, start, end)
@@ -603,11 +653,18 @@
         value = self._utf8
         if space.isinstance_w(w_suffix, space.w_tuple):
             return self._endswith_tuple(space, value, w_suffix, start, end)
-        return space.newbool(self._endswith(space, value, w_suffix, start,
+        try:
+            return space.newbool(self._endswith(space, value, w_suffix, start,
                                             end))
+        except OperationError as e:
+            if e.match(space, space.w_TypeError):
+                raise oefmt(space.w_TypeError, 'endswith first arg must be str '
+                        'or a tuple of str, not %T', w_suffix)
 
     def _endswith(self, space, value, w_prefix, start, end):
         prefix = self.convert_arg_to_w_unicode(space, w_prefix)._utf8
+        if start > len(value):
+            return False
         if len(prefix) == 0:
             return True
         return endswith(value, prefix, start, end)
@@ -684,8 +741,9 @@
     def descr_upper(self, space):
         builder = rutf8.Utf8StringBuilder(len(self._utf8))
         for ch in rutf8.Utf8StringIterator(self._utf8):
-            ch = unicodedb.toupper(ch)
-            builder.append_code(ch)
+            codes = unicodedb.toupper_full(ch)
+            for c in codes:
+                builder.append_code(c)
         return self.from_utf8builder(builder)
 
     @unwrap_spec(width=int)
@@ -792,14 +850,16 @@
         builder = rutf8.Utf8StringBuilder(len(self._utf8))
         it = rutf8.Utf8StringIterator(self._utf8)
         uchar = it.next()
-        ch = unicodedb.toupper(uchar)
-        builder.append_code(ch)
+        codes = unicodedb.toupper_full(uchar)
+        # may yield several code points, e.g. omega-with-ypogegrammeni (U+1FF3, 8179)
+        for c in codes:
+            builder.append_code(c)
         for ch in it:
             ch = unicodedb.tolower(ch)
             builder.append_code(ch)
         return self.from_utf8builder(builder)
 
-    @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
+    @unwrap_spec(width=int, w_fillchar=WrappedDefault(u' '))
     def descr_center(self, space, width, w_fillchar):
         value = self._utf8
         fillchar = self.convert_arg_to_w_unicode(space, w_fillchar)._utf8
@@ -978,14 +1038,14 @@
         end_index = len(self._utf8)
         if start > 0:
             if start > self._length:
-                start_index = end_index
+                start_index = end_index + 1
             else:
                 start_index = self._index_to_byte(start)
         if end < self._length:
             end_index = self._index_to_byte(end)
         return (start_index, end_index)
 
-    @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
+    @unwrap_spec(width=int, w_fillchar=WrappedDefault(u' '))
     def descr_rjust(self, space, width, w_fillchar):
         value = self._utf8
         lgt = self._len()
@@ -1004,7 +1064,7 @@
 
         return W_UnicodeObject(value, lgt)
 
-    @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
+    @unwrap_spec(width=int, w_fillchar=WrappedDefault(u' '))
     def descr_ljust(self, space, width, w_fillchar):
         value = self._utf8
         w_fillchar = self.convert_arg_to_w_unicode(space, w_fillchar)
@@ -1080,23 +1140,11 @@
 
 
     def descr_isprintable(self, space):
-        for uchar in self._value:
-            if not unicodedb.isprintable(ord(uchar)):
+        for ch in rutf8.Utf8StringIterator(self._utf8):
+            if not unicodedb.isprintable(ch):
                 return space.w_False
         return space.w_True
 
-    def _fix_fillchar(func):
-        # XXX: hack
-        from rpython.tool.sourcetools import func_with_new_name
-        func = func_with_new_name(func, func.__name__)
-        func.unwrap_spec = func.unwrap_spec.copy()
-        func.unwrap_spec['w_fillchar'] = WrappedDefault(u' ')
-        return func
-
-    descr_center = _fix_fillchar(StringMethods.descr_center)
-    descr_ljust = _fix_fillchar(StringMethods.descr_ljust)
-    descr_rjust = _fix_fillchar(StringMethods.descr_rjust)
-
     @staticmethod
     def _iter_getitem_result(self, space, index):
         assert isinstance(self, W_UnicodeObject)
@@ -1172,7 +1220,7 @@
 def decode_object(space, w_obj, encoding, errors):
     if encoding is None:
         encoding = getdefaultencoding(space)
-    if errors is None or errors == 'strict':
+    if errors is None or errors == 'strict' or errors == 'surrogateescape':
         if encoding == 'ascii':
             s = space.charbuf_w(w_obj)
             unicodehelper.check_ascii_or_raise(space, s)
@@ -1824,7 +1872,7 @@
 def unicode_to_decimal_w(space, w_unistr, allow_surrogates=False):
     if not isinstance(w_unistr, W_UnicodeObject):
         raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
-    value = _rpy_unicode_to_decimal_w(space, w_unistr.utf8_w(space))
+    value = _rpy_unicode_to_decimal_w(space, w_unistr.utf8_w(space).decode('utf8'))
     return unicodehelper.encode_utf8(space, value,
                                      allow_surrogates=allow_surrogates)