[pypy-commit] pypy default: Backport test additions and cleanups from unicode-utf8

Tue Dec 12 00:02:28 EST 2017

Author: Ronan Lamy <ronan.lamy at gmail.com>
Branch: 
Changeset: r93381:458ccc9243cd
Date: 2017-12-12 04:57 +0000
http://bitbucket.org/pypy/pypy/changeset/458ccc9243cd/

Log:	Backport test additions and cleanups from unicode-utf8

diff --git a/pypy/interpreter/astcompiler/test/test_astbuilder.py b/pypy/interpreter/astcompiler/test/test_astbuilder.py
--- a/pypy/interpreter/astcompiler/test/test_astbuilder.py
+++ b/pypy/interpreter/astcompiler/test/test_astbuilder.py
@@ -1246,3 +1246,7 @@
         exc = py.test.raises(SyntaxError, self.get_ast, input).value
         assert exc.msg == ("(unicode error) 'unicodeescape' codec can't decode"
                            " bytes in position 0-1: truncated \\xXX escape")
+        input = "u'\\x1'"
+        exc = py.test.raises(SyntaxError, self.get_ast, input).value
+        assert exc.msg == ("(unicode error) 'unicodeescape' codec can't decode"
+                           " bytes in position 0-2: truncated \\xXX escape")
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -210,7 +210,8 @@
 def xmlcharrefreplace_errors(space, w_exc):
     check_exception(space, w_exc)
     if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
-        obj = space.realunicode_w(space.getattr(w_exc, space.newtext('object')))
+        w_obj = space.getattr(w_exc, space.newtext('object'))
+        obj = space.realunicode_w(w_obj)
         start = space.int_w(space.getattr(w_exc, space.newtext('start')))
         w_end = space.getattr(w_exc, space.newtext('end'))
         end = space.int_w(w_end)
@@ -236,7 +237,8 @@
 def backslashreplace_errors(space, w_exc):
     check_exception(space, w_exc)
     if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
-        obj = space.realunicode_w(space.getattr(w_exc, space.newtext('object')))
+        w_obj = space.getattr(w_exc, space.newtext('object'))
+        obj = space.realunicode_w(w_obj)
         start = space.int_w(space.getattr(w_exc, space.newtext('start')))
         w_end = space.getattr(w_exc, space.newtext('end'))
         end = space.int_w(w_end)
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -115,10 +115,10 @@
         raises(TypeError, charmap_decode, '\xff', "strict",  {0xff: 0x110000})
         assert (charmap_decode("\x00\x01\x02", "strict",
                                {0: 0x10FFFF, 1: ord('b'), 2: ord('c')}) ==
-                u"\U0010FFFFbc", 3)
+                (u"\U0010FFFFbc", 3))
         assert (charmap_decode("\x00\x01\x02", "strict",
                                {0: u'\U0010FFFF', 1: u'b', 2: u'c'}) ==
-                u"\U0010FFFFbc", 3)
+                (u"\U0010FFFFbc", 3))
 
     def test_escape_decode_errors(self):
         from _codecs import escape_decode as decode
@@ -537,8 +537,12 @@
         assert '\xff'.decode('utf-7', 'ignore') == ''
         assert '\x00'.decode('unicode-internal', 'ignore') == ''
 
-    def test_backslahreplace(self):
-        assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == 'a\\xac\u1234\u20ac\u8000'
+    def test_backslashreplace(self):
+        sin = u"a\xac\u1234\u20ac\u8000\U0010ffff"
+        expected = "a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
+        assert sin.encode('ascii', 'backslashreplace') == expected
+        expected = "a\xac\\u1234\xa4\\u8000\\U0010ffff"
+        assert sin.encode("iso-8859-15", "backslashreplace") == expected
 
     def test_badhandler(self):
         import codecs
@@ -592,11 +596,11 @@
         def handler_unicodeinternal(exc):
             if not isinstance(exc, UnicodeDecodeError):
                 raise TypeError("don't know how to handle %r" % exc)
-            return (u"\x01", 1)
+            return (u"\x01", 4)
         codecs.register_error("test.hui", handler_unicodeinternal)
         res = "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui")
         if sys.maxunicode > 65535:
-            assert res == u"\u0000\u0001\u0000"   # UCS4 build
+            assert res == u"\u0000\u0001"   # UCS4 build
         else:
             assert res == u"\x00\x00\x01\x00\x00" # UCS2 build
 
@@ -750,3 +754,31 @@
         assert _codecs.unicode_escape_decode(b) == (u'', 0)
         assert _codecs.raw_unicode_escape_decode(b) == (u'', 0)
         assert _codecs.unicode_internal_decode(b) == (u'', 0)
+
+    def test_xmlcharrefreplace(self):
+        r = u'\u1234\u0080\u2345\u0079\u00AB'.encode('latin1', 'xmlcharrefreplace')
+        assert r == 'ሴ\x80⍅y\xab'
+        r = u'\u1234\u0080\u2345\u0079\u00AB'.encode('ascii', 'xmlcharrefreplace')
+        assert r == 'ሴ€⍅y«'
+
+    def test_errorhandler_collection(self):
+        import _codecs
+        errors = []
+        def record_error(exc):
+            if not isinstance(exc, UnicodeEncodeError):
+                raise TypeError("don't know how to handle %r" % exc)
+            errors.append(exc.object[exc.start:exc.end])
+            return (u'', exc.end)
+        _codecs.register_error("test.record", record_error)
+
+        sin = u"\xac\u1234\u1234\u20ac\u8000"
+        assert sin.encode("ascii", "test.record") == ""
+        assert errors == [sin]
+
+        errors = []
+        assert sin.encode("latin-1", "test.record") == "\xac"
+        assert errors == [u'\u1234\u1234\u20ac\u8000']
+
+        errors = []
+        assert sin.encode("iso-8859-15", "test.record") == "\xac\xa4"
+        assert errors == [u'\u1234\u1234', u'\u8000']
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -414,6 +414,7 @@
     if not space.isinstance_w(w_decoded, space.w_unicode):
         msg = "decoder should return a string result, not '%T'"
         raise oefmt(space.w_TypeError, msg, w_decoded)
+    return w_decoded
 
 
 class W_TextIOWrapper(W_TextIOBase):
@@ -940,12 +941,13 @@
 
             w_decoded = space.call_method(self.w_decoder, "decode",
                                           w_chunk, space.newbool(bool(cookie.need_eof)))
-            self.decoded.set(space, w_decoded)
+            w_decoded = check_decoded(space, w_decoded)
 
             # Skip chars_to_skip of the decoded characters
-            if len(self.decoded.text) < cookie.chars_to_skip:
+            if space.len_w(w_decoded) < cookie.chars_to_skip:
                 raise oefmt(space.w_IOError,
                             "can't restore logical file position")
+            self.decoded.set(space, w_decoded)
             self.decoded.pos = cookie.chars_to_skip
         else:
             self.snapshot = PositionSnapshot(cookie.dec_flags, "")
@@ -958,10 +960,8 @@
 
     def tell_w(self, space):
         self._check_closed(space)
-
         if not self.seekable:
             raise oefmt(space.w_IOError, "underlying stream is not seekable")
-
         if not self.telling:
             raise oefmt(space.w_IOError,
                         "telling position disabled by next() call")
@@ -1031,14 +1031,14 @@
                 # We didn't get enough decoded data; signal EOF to get more.
                 w_decoded = space.call_method(self.w_decoder, "decode",
                                               space.newbytes(""),
-                                              space.newint(1)) # final=1
+                                              space.newint(1))  # final=1
                 check_decoded(space, w_decoded)
-                chars_decoded += len(space.unicode_w(w_decoded))
+                chars_decoded += space.len_w(w_decoded)
                 cookie.need_eof = 1
 
                 if chars_decoded < chars_to_skip:
                     raise oefmt(space.w_IOError,
-                                "can't reconstruct logical file position")
+                        "can't reconstruct logical file position")
         finally:
             space.call_method(self.w_decoder, "setstate", w_saved_state)
 
diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py
--- a/pypy/module/_io/test/test_interp_textio.py
+++ b/pypy/module/_io/test/test_interp_textio.py
@@ -40,7 +40,8 @@
         w_newline=space.newtext(mode))
     lines = []
     for limit in limits:
-        line = space.unicode_w(w_textio.readline_w(space, space.newint(limit)))
+        w_line = w_textio.readline_w(space, space.newint(limit))
+        line = space.unicode_w(w_line)
         if limit >= 0:
             assert len(line) <= limit
         if line:
diff --git a/pypy/module/_pypyjson/interp_encoder.py b/pypy/module/_pypyjson/interp_encoder.py
--- a/pypy/module/_pypyjson/interp_encoder.py
+++ b/pypy/module/_pypyjson/interp_encoder.py
@@ -49,24 +49,24 @@
         first = 0
 
     for i in range(first, len(u)):
-        c = u[i]
-        if c <= u'~':
-            if c == u'"' or c == u'\\':
+        c = ord(u[i])
+        if c <= ord('~'):
+            if c == ord('"') or c == ord('\\'):
                 sb.append('\\')
-            elif c < u' ':
-                sb.append(ESCAPE_BEFORE_SPACE[ord(c)])
+            elif c < ord(' '):
+                sb.append(ESCAPE_BEFORE_SPACE[c])
                 continue
-            sb.append(chr(ord(c)))
+            sb.append(chr(c))
         else:
-            if c <= u'\uffff':
+            if c <= ord(u'\uffff'):
                 sb.append('\\u')
-                sb.append(HEX[ord(c) >> 12])
-                sb.append(HEX[(ord(c) >> 8) & 0x0f])
-                sb.append(HEX[(ord(c) >> 4) & 0x0f])
-                sb.append(HEX[ord(c) & 0x0f])
+                sb.append(HEX[c >> 12])
+                sb.append(HEX[(c >> 8) & 0x0f])
+                sb.append(HEX[(c >> 4) & 0x0f])
+                sb.append(HEX[c & 0x0f])
             else:
                 # surrogate pair
-                n = ord(c) - 0x10000
+                n = c - 0x10000
                 s1 = 0xd800 | ((n >> 10) & 0x3ff)
                 sb.append('\\ud')
                 sb.append(HEX[(s1 >> 8) & 0x0f])
diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py
--- a/pypy/module/_rawffi/alt/type_converter.py
+++ b/pypy/module/_rawffi/alt/type_converter.py
@@ -128,7 +128,7 @@
         intval: lltype.Signed
         """
         self.error(w_ffitype, w_obj)
-        
+
     def handle_unichar(self, w_ffitype, w_obj, intval):
         """
         intval: lltype.Signed
@@ -174,7 +174,7 @@
     def handle_struct_rawffi(self, w_ffitype, w_structinstance):
         """
         This method should be killed as soon as we remove support for _rawffi structures
-        
+
         w_structinstance: W_StructureInstance
         """
         self.error(w_ffitype, w_structinstance)
@@ -349,7 +349,7 @@
     def get_struct_rawffi(self, w_ffitype, w_structdescr):
         """
         This should be killed as soon as we kill support for _rawffi structures
-        
+
         Return type: lltype.Unsigned
         (the address of the structure)
         """
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -481,11 +481,13 @@
 
     @unwrap_spec(w_groupnum=WrappedDefault(0))
     def start_w(self, w_groupnum):
-        return self.space.newint(self.do_span(w_groupnum)[0])
+        start, end = self.do_span(w_groupnum)
+        return self.space.newint(start)
 
     @unwrap_spec(w_groupnum=WrappedDefault(0))
     def end_w(self, w_groupnum):
-        return self.space.newint(self.do_span(w_groupnum)[1])
+        start, end = self.do_span(w_groupnum)
+        return self.space.newint(end)
 
     @unwrap_spec(w_groupnum=WrappedDefault(0))
     def span_w(self, w_groupnum):
diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -87,6 +87,14 @@
         assert [("a", "l"), ("u", "s")] == re.findall("b(.)(.)", "abalbus")
         assert [("a", ""), ("s", "s")] == re.findall("b(a|(s))", "babs")
 
+    def test_findall_unicode(self):
+        import re
+        assert [u"\u1234"] == re.findall(u"\u1234", u"\u1000\u1234\u2000")
+        assert ["a", "u"] == re.findall("b(.)", "abalbus")
+        assert [("a", "l"), ("u", "s")] == re.findall("b(.)(.)", "abalbus")
+        assert [("a", ""), ("s", "s")] == re.findall("b(a|(s))", "babs")
+        assert [u"xyz"] == re.findall(u".*yz", u"xyz")
+
     def test_finditer(self):
         import re
         it = re.finditer("b(.)", "brabbel")
@@ -999,3 +1007,15 @@
         import re
         assert re.search(".+ab", "wowowowawoabwowo")
         assert None == re.search(".+ab", "wowowaowowo")
+
+
+class AppTestUnicodeExtra:
+    def test_string_attribute(self):
+        import re
+        match = re.search(u"\u1234", u"\u1233\u1234\u1235")
+        assert match.string == u"\u1233\u1234\u1235"
+
+    def test_match_start(self):
+        import re
+        match = re.search(u"\u1234", u"\u1233\u1234\u1235")
+        assert match.start() == 1
diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py
--- a/pypy/module/unicodedata/interp_ucd.py
+++ b/pypy/module/unicodedata/interp_ucd.py
@@ -259,10 +259,10 @@
                 result[0] = ch
 
         if not composed: # If decomposed normalization we are done
-            return space.newunicode(u''.join([unichr(i) for i in result[:j]]))
+            return self.build(space, result, stop=j)
 
         if j <= 1:
-            return space.newunicode(u''.join([unichr(i) for i in result[:j]]))
+            return self.build(space, result, stop=j)
 
         current = result[0]
         starter_pos = 0
@@ -310,7 +310,10 @@
 
         result[starter_pos] = current
 
-        return space.newunicode(u''.join([unichr(i) for i in result[:next_insert]]))
+        return self.build(space, result, stop=next_insert)
+
+    def build(self, space, r, stop):
+        return space.newunicode(u''.join([unichr(i) for i in r[:stop]]))
 
 
 methods = {}
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -135,6 +135,11 @@
         check(u'a' + 'b', u'ab')
         check('a' + u'b', u'ab')
 
+    def test_getitem(self):
+        assert u'abc'[2] == 'c'
+        raises(IndexError, u'abc'.__getitem__, 15)
+        assert u'g\u0105\u015b\u0107'[2] == u'\u015b'
+
     def test_join(self):
         def check(a, b):
             assert a == b
@@ -171,6 +176,8 @@
         assert u'\n\n'.splitlines() == [u'', u'']
         assert u'a\nb\nc'.splitlines(1) == [u'a\n', u'b\n', u'c']
         assert u'\na\nb\n'.splitlines(1) == [u'\n', u'a\n', u'b\n']
+        assert ((u'a' + '\xc2\x85'.decode('utf8') + u'b\n').splitlines() ==
+                ['a', 'b'])
 
     def test_zfill(self):
         assert u'123'.zfill(2) == u'123'
@@ -217,6 +224,7 @@
         raises(ValueError, u'abc'.split, u'')
         raises(ValueError, 'abc'.split, u'')
         assert u'   a b c d'.split(None, 0) == [u'a b c d']
+        assert u'a\nb\u1680c'.split() == [u'a', u'b', u'c']
 
     def test_rsplit(self):
         assert u"".rsplit() == []
@@ -246,6 +254,7 @@
         raises(ValueError, 'abc'.rsplit, u'')
         assert u'  a b c  '.rsplit(None, 0) == [u'  a b c']
         assert u''.rsplit('aaa') == [u'']
+        assert u'a\nb\u1680c'.rsplit() == [u'a', u'b', u'c']
 
     def test_split_rsplit_str_unicode(self):
         x = 'abc'.split(u'b')
@@ -291,6 +300,8 @@
         assert u"bROWN fOX".title() == u"Brown Fox"
         assert u"Brown Fox".title() == u"Brown Fox"
         assert u"bro!wn fox".title() == u"Bro!Wn Fox"
+        assert u"brow\u4321n fox".title() == u"Brow\u4321N Fox"
+        assert u'\ud800'.title() == u'\ud800'
 
     def test_istitle(self):
         assert u"".istitle() == False
@@ -315,6 +326,18 @@
         assert not u'\u01c5abc'.islower()
         assert not u'\u01c5ABC'.isupper()
 
+    def test_lower_upper(self):
+        assert u'a'.lower() == u'a'
+        assert u'A'.lower() == u'a'
+        assert u'\u0105'.lower() == u'\u0105'
+        assert u'\u0104'.lower() == u'\u0105'
+        assert u'\ud800'.lower() == u'\ud800'
+        assert u'a'.upper() == u'A'
+        assert u'A'.upper() == u'A'
+        assert u'\u0105'.upper() == u'\u0104'
+        assert u'\u0104'.upper() == u'\u0104'
+        assert u'\ud800'.upper() == u'\ud800'
+
     def test_capitalize(self):
         assert u"brown fox".capitalize() == u"Brown fox"
         assert u' hello '.capitalize() == u' hello '
@@ -336,6 +359,8 @@
         # check with Ll chars with no upper - nothing changes here
         assert (u'\u019b\u1d00\u1d86\u0221\u1fb7'.capitalize() ==
                 u'\u019b\u1d00\u1d86\u0221\u1fb7')
+        assert u'\ud800'.capitalize() == u'\ud800'
+        assert u'xx\ud800'.capitalize() == u'Xx\ud800'
 
     def test_rjust(self):
         s = u"abc"
@@ -376,6 +401,16 @@
         assert u'one!two!three!'.replace('x', '@') == u'one!two!three!'
         assert u'one!two!three!'.replace(u'x', '@', 2) == u'one!two!three!'
         assert u'abc'.replace('', u'-') == u'-a-b-c-'
+        assert u'\u1234'.replace(u'', '-') == u'-\u1234-'
+        assert u'\u0234\u5678'.replace('', u'-') == u'-\u0234-\u5678-'
+        assert u'\u0234\u5678'.replace('', u'-', 0) == u'\u0234\u5678'
+        assert u'\u0234\u5678'.replace('', u'-', 1) == u'-\u0234\u5678'
+        assert u'\u0234\u5678'.replace('', u'-', 2) == u'-\u0234-\u5678'
+        assert u'\u0234\u5678'.replace('', u'-', 3) == u'-\u0234-\u5678-'
+        assert u'\u0234\u5678'.replace('', u'-', 4) == u'-\u0234-\u5678-'
+        assert u'\u0234\u5678'.replace('', u'-', 700) == u'-\u0234-\u5678-'
+        assert u'\u0234\u5678'.replace('', u'-', -1) == u'-\u0234-\u5678-'
+        assert u'\u0234\u5678'.replace('', u'-', -42) == u'-\u0234-\u5678-'
         assert u'abc'.replace(u'', u'-', 3) == u'-a-b-c'
         assert u'abc'.replace('', '-', 0) == u'abc'
         assert u''.replace(u'', '') == u''
@@ -479,6 +514,9 @@
         assert u''.startswith(u'a') is False
         assert u'x'.startswith(u'xx') is False
         assert u'y'.startswith(u'xx') is False
+        assert u'\u1234\u5678\u4321'.startswith(u'\u1234') is True
+        assert u'\u1234\u5678\u4321'.startswith(u'\u1234\u4321') is False
+        assert u'\u1234'.startswith(u'', 1, 0) is True
 
     def test_startswith_more(self):
         assert u'ab'.startswith(u'a', 0) is True
@@ -589,7 +627,7 @@
         raises(TypeError, u'hello'.translate)
         raises(TypeError, u'abababc'.translate, {ord('a'):''})
 
-    def test_unicode_form_encoded_object(self):
+    def test_unicode_from_encoded_object(self):
         assert unicode('x', 'utf-8') == u'x'
         assert unicode('x', 'utf-8', 'strict') == u'x'
 
@@ -634,6 +672,8 @@
         assert u'\u20ac'.encode('utf-8') == '\xe2\x82\xac'
         assert u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82'
         assert u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96'
+        assert u'\ud800\udc02'.encode('uTf-8') == '\xf0\x90\x80\x82'
+        assert u'\ud84d\udc56'.encode('Utf8') == '\xf0\xa3\x91\x96'
         assert u'\ud800'.encode('utf-8') == '\xed\xa0\x80'
         assert u'\udc00'.encode('utf-8') == '\xed\xb0\x80'
         assert (u'\ud800\udc02'*1000).encode('utf-8') == '\xf0\x90\x80\x82'*1000
@@ -745,6 +785,7 @@
     def test_index(self):
         assert u"rrarrrrrrrrra".index(u'a', 4, None) == 12
         assert u"rrarrrrrrrrra".index(u'a', None, 6) == 2
+        assert u"\u1234\u4321\u5678".index(u'\u5678', 1) == 2
 
     def test_rindex(self):
         from sys import maxint
@@ -754,6 +795,7 @@
         assert u'abcdefghiabc'.rindex(u'abc', 0, -1) == 0
         assert u'abcdefghiabc'.rindex(u'abc', -4*maxint, 4*maxint) == 9
         assert u'rrarrrrrrrrra'.rindex(u'a', 4, None) == 12
+        assert u"\u1234\u5678".rindex(u'\u5678') == 1
 
         raises(ValueError, u'abcdefghiabc'.rindex, u'hib')
         raises(ValueError, u'defghiabc'.rindex, u'def', 1)
@@ -768,12 +810,15 @@
         assert u'abcdefghiabc'.rfind(u'') == 12
         assert u'abcdefghiabc'.rfind(u'abcd') == 0
         assert u'abcdefghiabc'.rfind(u'abcz') == -1
+        assert u"\u1234\u5678".rfind(u'\u5678') == 1
 
     def test_rfind_corner_case(self):
         assert u'abc'.rfind('', 4) == -1
 
     def test_find_index_str_unicode(self):
-        assert 'abcdefghiabc'.find(u'bc') == 1
+        assert u'abcdefghiabc'.find(u'bc') == 1
+        assert u'ab\u0105b\u0107'.find('b', 2) == 3
+        assert u'ab\u0105b\u0107'.find('b', 0, 1) == -1
         assert 'abcdefghiabc'.rfind(u'abc') == 9
         raises(UnicodeDecodeError, '\x80'.find, u'')
         raises(UnicodeDecodeError, '\x80'.rfind, u'')
@@ -781,6 +826,7 @@
         assert 'abcdefghiabc'.rindex(u'abc') == 9
         raises(UnicodeDecodeError, '\x80'.index, u'')
         raises(UnicodeDecodeError, '\x80'.rindex, u'')
+        assert u"\u1234\u5678".find(u'\u5678') == 1
 
     def test_count(self):
         assert u"".count(u"x") ==0
@@ -807,6 +853,7 @@
 
     def test_swapcase(self):
         assert u'\xe4\xc4\xdf'.swapcase() == u'\xc4\xe4\xdf'
+        assert u'\ud800'.swapcase() == u'\ud800'
 
     def test_buffer(self):
         buf = buffer(u'XY')
@@ -878,16 +925,31 @@
 
     def test_getslice(self):
         assert u'123456'.__getslice__(1, 5) == u'2345'
-        s = u"abc"
-        assert s[:] == "abc"
-        assert s[1:] == "bc"
-        assert s[:2] == "ab"
-        assert s[1:2] == "b"
-        assert s[-2:] == "bc"
-        assert s[:-1] == "ab"
-        assert s[-2:2] == "b"
-        assert s[1:-1] == "b"
-        assert s[-2:-1] == "b"
+        s = u"\u0105b\u0107"
+        assert s[:] == u"\u0105b\u0107"
+        assert s[1:] == u"b\u0107"
+        assert s[:2] == u"\u0105b"
+        assert s[1:2] == u"b"
+        assert s[-2:] == u"b\u0107"
+        assert s[:-1] == u"\u0105b"
+        assert s[-2:2] == u"b"
+        assert s[1:-1] == u"b"
+        assert s[-2:-1] == u"b"
+
+    def test_getitem_slice(self):
+        assert u'123456'.__getitem__(slice(1, 5)) == u'2345'
+        s = u"\u0105b\u0107"
+        assert s[slice(3)] == u"\u0105b\u0107"
+        assert s[slice(1, 3)] == u"b\u0107"
+        assert s[slice(2)] == u"\u0105b"
+        assert s[slice(1,2)] == u"b"
+        assert s[slice(-2,3)] == u"b\u0107"
+        assert s[slice(-1)] == u"\u0105b"
+        assert s[slice(-2,2)] == u"b"
+        assert s[slice(1,-1)] == u"b"
+        assert s[slice(-2,-1)] == u"b"
+        assert u"abcde"[::2] == u"ace"
+        assert u"\u0105\u0106\u0107abcd"[::2] == u"\u0105\u0107bd"
 
     def test_no_len_on_str_iter(self):
         iterable = u"hello"