[pypy-commit] pypy unicode-utf8: improve the slice tests and fix unicode slicing

Thu Oct 26 14:11:41 EDT 2017

Author: fijal
Branch: unicode-utf8
Changeset: r92855:84d1ebd9002d
Date: 2017-10-26 20:11 +0200
http://bitbucket.org/pypy/pypy/changeset/84d1ebd9002d/

Log:	improve the slice tests and fix unicode slicing

diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -907,16 +907,31 @@
 
     def test_getslice(self):
         assert u'123456'.__getslice__(1, 5) == u'2345'
-        s = u"abc"
-        assert s[:] == "abc"
-        assert s[1:] == "bc"
-        assert s[:2] == "ab"
-        assert s[1:2] == "b"
-        assert s[-2:] == "bc"
-        assert s[:-1] == "ab"
-        assert s[-2:2] == "b"
-        assert s[1:-1] == "b"
-        assert s[-2:-1] == "b"
+        s = u"\u0105b\u0107"
+        assert s[:] == u"\u0105b\u0107"
+        assert s[1:] == u"b\u0107"
+        assert s[:2] == u"\u0105b"
+        assert s[1:2] == u"b"
+        assert s[-2:] == u"b\u0107"
+        assert s[:-1] == u"\u0105b"
+        assert s[-2:2] == u"b"
+        assert s[1:-1] == u"b"
+        assert s[-2:-1] == u"b"
+
+    def test_getitem_slice(self):
+        assert u'123456'.__getitem__(slice(1, 5)) == u'2345'
+        s = u"\u0105b\u0107"
+        assert s[slice(3)] == u"\u0105b\u0107"
+        assert s[slice(1, 3)] == u"b\u0107"
+        assert s[slice(2)] == u"\u0105b"
+        assert s[slice(1,2)] == u"b"
+        assert s[slice(-2,3)] == u"b\u0107"
+        assert s[slice(-1)] == u"\u0105b"
+        assert s[slice(-2,2)] == u"b"
+        assert s[slice(1,-1)] == u"b"
+        assert s[slice(-2,-1)] == u"b"
+        assert u"abcde"[::2] == u"ace"
+        assert u"\u0105\u0106\u0107abcd"[::2] == u"\u0105\u0107bd"
 
     def test_no_len_on_str_iter(self):
         iterable = u"hello"
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -21,7 +21,7 @@
 from pypy.objspace.std import newformat
 from pypy.objspace.std.basestringtype import basestring_typedef
 from pypy.objspace.std.formatting import mod_format
-from pypy.objspace.std.sliceobject import (
+from pypy.objspace.std.sliceobject import (W_SliceObject,
     unwrap_start_stop, normalize_simple_slice)
 from pypy.objspace.std.stringmethods import StringMethods
 from pypy.objspace.std.util import IDTAG_SPECIAL, IDTAG_SHIFT
@@ -724,8 +724,36 @@
 
         return space.newlist_utf8(res)
 
+    def descr_getitem(self, space, w_index):
+        if isinstance(w_index, W_SliceObject):
+            length = self._len()
+            start, stop, step, sl = w_index.indices4(space, length)
+            if sl == 0:
+                return self._empty()
+            elif step == 1:
+                assert start >= 0 and stop >= 0
+                return self._unicode_sliced(space, start, stop)
+            else:
+                return self._getitem_slice_slowpath(space, start, step, sl)
+
+        index = space.getindex_w(w_index, space.w_IndexError, "string index")
+        return self._getitem_result(space, index)
+
+    def _getitem_slice_slowpath(self, space, start, step, sl):
+        # XXX same comment as in _unicode_sliced
+        builder = StringBuilder(step * sl)
+        byte_pos = self._index_to_byte(start)
+        i = 0
+        while True:
+            next_pos = rutf8.next_codepoint_pos(self._utf8, byte_pos)
+            builder.append(self._utf8[byte_pos:next_pos])
+            if i == sl - 1:
+                break
+            i += 1
+            byte_pos = self._index_to_byte(start + i * step)
+        return W_UnicodeObject(builder.build(), sl)
+
     def descr_getslice(self, space, w_start, w_stop):
-        selfvalue = self._utf8
         start, stop = normalize_simple_slice(
             space, self._len(), w_start, w_stop)
         if start == stop: