[pypy-commit] pypy py3k: Issue1262 in-progress: lone surrogates are invalid in utf-8.

Mon Sep 24 23:46:56 CEST 2012

Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: py3k
Changeset: r57528:5f5f1528884c
Date: 2012-09-23 17:06 +0200
http://bitbucket.org/pypy/pypy/changeset/5f5f1528884c/

Log:	Issue1262 (in progress): lone surrogates are invalid in UTF-8. The
	one exception is encoding on narrow unicode builds, where a surrogate
	pair is still combined into a single character. Patch by arielby

diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -481,6 +481,7 @@
         assert '\ud84d\udc56'.encode('utf-8') == b'\xf0\xa3\x91\x96'
         raises(UnicodeEncodeError, '\ud800'.encode, 'utf-8')
         raises(UnicodeEncodeError, '\udc00'.encode, 'utf-8')
+        raises(UnicodeEncodeError, '\udc00!'.encode, 'utf-8')
         assert ('\ud800\udc02'*1000).encode('utf-8') == b'\xf0\x90\x80\x82'*1000
         assert (
             '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
@@ -505,6 +506,18 @@
         assert str(b'\xf0\xa3\x91\x96', 'utf-8') == '\U00023456' 
         assert str(b'\xf0\x90\x80\x82', 'utf-8') == '\U00010002' 
         assert str(b'\xe2\x82\xac', 'utf-8') == '\u20ac' 
+        # Invalid Continuation Bytes, EOF
+        raises(UnicodeDecodeError, b'\xc4\x00'.decode, 'utf-8')
+        raises(UnicodeDecodeError, b'\xe2\x82'.decode, 'utf-8')
+        # Non-Canonical Forms
+        raises(UnicodeDecodeError, b'\xc0\x80'.decode, 'utf-8')
+        raises(UnicodeDecodeError, b'\xc1\xbf'.decode, 'utf-8')
+        raises(UnicodeDecodeError, b'\xe0\x9f\xbf'.decode, 'utf-8')
+        raises(UnicodeDecodeError, b'\xf0\x8f\x8f\x84'.decode, 'utf-8')
+        raises(UnicodeDecodeError, b'\xf5\x80\x81\x82'.decode, 'utf-8')
+        raises(UnicodeDecodeError, b'\xf4\x90\x80\x80'.decode, 'utf-8')
+        # CESU-8
+        raises(UnicodeDecodeError, b'\xed\xa0\xbc\xed\xb2\xb1'.decode, 'utf-8') 
 
     def test_codecs_errors(self):
         # Error handling (encoding)
@@ -706,6 +719,8 @@
     def test_encode_raw_unicode_escape(self):
         u = str(b'\\', 'raw_unicode_escape')
         assert u == '\\'
+        s = '\u05d1\u05d3\u05d9\u05e7\u05d4'.encode('raw_unicode_escape')
+        assert s == b'\\u05d1\\u05d3\\u05d9\\u05e7\\u05d4'
 
     def test_decode_from_buffer(self):
         buf = b'character buffers are decoded to unicode'
diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -184,8 +184,7 @@
             if (ordch2>>6 != 0x2 or    # 0b10
                 (ordch1 == 0xe0 and ordch2 < 0xa0)
                 # surrogates shouldn't be valid UTF-8!
-                # Uncomment the line below to make them invalid.
-                # or (ordch1 == 0xed and ordch2 > 0x9f)
+                or (ordch1 == 0xed and ordch2 > 0x9f)
                 ):
                 r, pos = errorhandler(errors, 'utf-8',
                                       'invalid continuation byte',
@@ -277,15 +276,16 @@
             # Encode UCS2 Unicode ordinals
             if ch < 0x10000:
                 # Special case: check for high surrogate
-                if 0xD800 <= ch <= 0xDFFF and pos != size:
-                    ch2 = ord(s[pos])
-                    # Check for low surrogate and combine the two to
-                    # form a UCS4 value
-                    if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
-                        ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
-                        pos += 1
-                        _encodeUCS4(result, ch3)
-                        continue
+                if 0xD800 <= ch <= 0xDFFF:
+                    if pos != size:
+                        ch2 = ord(s[pos])
+                        # Check for low surrogate and combine the two to
+                        # form a UCS4 value
+                        if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
+                            ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
+                            pos += 1
+                            _encodeUCS4(result, ch3)
+                            continue
                     r, pos = errorhandler(errors, 'utf-8',
                                           'surrogates not allowed',
                                           s, pos-1, pos)
diff --git a/pypy/rlib/test/test_runicode.py b/pypy/rlib/test/test_runicode.py
--- a/pypy/rlib/test/test_runicode.py
+++ b/pypy/rlib/test/test_runicode.py
@@ -245,9 +245,8 @@
             self.checkdecode(s, "utf-8")
 
     def test_utf8_surrogate(self):
-        # A surrogate should not be valid utf-8, but python 2.x accepts them.
-        # This test will raise an error with python 3.x
-        self.checkdecode(u"\ud800", "utf-8")
+        # surrogates used to be allowed by python 2.x
+        raises(UnicodeDecodeError, self.checkdecode, u"\ud800", "utf-8")
 
     def test_invalid_start_byte(self):
         """
@@ -694,12 +693,16 @@
             self.checkencode(s, "utf-8")
 
     def test_utf8_surrogates(self):
-        # check replacing of two surrogates by single char while encoding
         # make sure that the string itself is not marshalled
         u = u"\ud800"
         for i in range(4):
             u += u"\udc00"
-        self.checkencode(u, "utf-8")
+        if runicode.MAXUNICODE < 65536:
+            # Check replacing of two surrogates by single char while encoding
+            self.checkencode(u, "utf-8")
+        else:
+            # This is not done in wide unicode builds
+            raises(UnicodeEncodeError, self.checkencode, u, "utf-8")
 
     def test_ascii_error(self):
         self.checkencodeerror(u"abc\xFF\xFF\xFFcde", "ascii", 3, 6)