[pypy-commit] pypy default: Issue1285: Python2 allows lone surrogates, also in string literals which appear in marshalled code.

Thu Oct 11 00:53:38 CEST 2012

Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: 
Changeset: r57996:ce4e0ff9862b
Date: 2012-10-11 00:51 +0200
http://bitbucket.org/pypy/pypy/changeset/ce4e0ff9862b/

Log:	Issue1285: Python2 allows lone surrogates, also in string literals
	which appear in marshalled code.

	Also use more direct code for functions that are often used.

diff --git a/pypy/interpreter/generator.py b/pypy/interpreter/generator.py
--- a/pypy/interpreter/generator.py
+++ b/pypy/interpreter/generator.py
@@ -3,7 +3,6 @@
 from pypy.interpreter.gateway import NoneNotWrapped
 from pypy.interpreter.pyopcode import LoopBlock
 from pypy.rlib import jit
-from pypy.rlib.objectmodel import specialize
 
 
 class GeneratorIterator(Wrappable):
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,10 +1,62 @@
+from pypy.interpreter.error import OperationError
+from pypy.rlib.objectmodel import specialize
+from pypy.rlib import runicode
 from pypy.module._codecs import interp_codecs
 
+@specialize.memo()
+def decode_error_handler(space):
+    def raise_unicode_exception_decode(errors, encoding, msg, s,
+                                       startingpos, endingpos):
+        raise OperationError(space.w_UnicodeDecodeError,
+                             space.newtuple([space.wrap(encoding),
+                                             space.wrap(s),
+                                             space.wrap(startingpos),
+                                             space.wrap(endingpos),
+                                             space.wrap(msg)]))
+    return raise_unicode_exception_decode
+
+@specialize.memo()
+def encode_error_handler(space):
+    def raise_unicode_exception_encode(errors, encoding, msg, u,
+                                       startingpos, endingpos):
+        raise OperationError(space.w_UnicodeEncodeError,
+                             space.newtuple([space.wrap(encoding),
+                                             space.wrap(u),
+                                             space.wrap(startingpos),
+                                             space.wrap(endingpos),
+                                             space.wrap(msg)]))
+    return raise_unicode_exception_encode
+
+# ____________________________________________________________
+
 def PyUnicode_AsEncodedString(space, w_data, w_encoding):
     return interp_codecs.encode(space, w_data, w_encoding)
 
 # These functions take and return unwrapped rpython strings and unicodes
-PyUnicode_DecodeUnicodeEscape = interp_codecs.make_raw_decoder('unicode_escape')
-PyUnicode_DecodeRawUnicodeEscape = interp_codecs.make_raw_decoder('raw_unicode_escape')
-PyUnicode_DecodeUTF8 = interp_codecs.make_raw_decoder('utf_8')
-PyUnicode_EncodeUTF8 = interp_codecs.make_raw_encoder('utf_8')
+def PyUnicode_DecodeUnicodeEscape(space, string):
+    state = space.fromcache(interp_codecs.CodecState)
+    unicodedata_handler = state.get_unicodedata_handler(space)
+    result, consumed = runicode.str_decode_unicode_escape(
+        string, len(string), "strict",
+        final=True, errorhandler=decode_error_handler(space),
+        unicodedata_handler=unicodedata_handler)
+    return result
+
+def PyUnicode_DecodeRawUnicodeEscape(space, string):
+    result, consumed = runicode.str_decode_raw_unicode_escape(
+        string, len(string), "strict",
+        final=True, errorhandler=decode_error_handler(space))
+    return result
+
+def PyUnicode_DecodeUTF8(space, string):
+    result, consumed = runicode.str_decode_utf_8(
+        string, len(string), "strict",
+        final=True, errorhandler=decode_error_handler(space),
+        allow_surrogates=True)
+    return result
+
+def PyUnicode_EncodeUTF8(space, uni):
+    return runicode.unicode_encode_utf_8(
+        uni, len(uni), "strict",
+        errorhandler=encode_error_handler(space),
+        allow_surrogates=True)
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -339,38 +339,6 @@
 
 from pypy.rlib import runicode
 
-def make_raw_encoder(name):
-    rname = "unicode_encode_%s" % (name.replace("_encode", ""), )
-    assert hasattr(runicode, rname)
-    def raw_encoder(space, uni):
-        state = space.fromcache(CodecState)
-        func = getattr(runicode, rname)
-        errors = "strict"
-        return func(uni, len(uni), errors, state.encode_error_handler)
-    raw_encoder.func_name = rname
-    return raw_encoder
-
-def make_raw_decoder(name):
-    rname = "str_decode_%s" % (name.replace("_decode", ""), )
-    assert hasattr(runicode, rname)
-    def raw_decoder(space, string):
-        final = True
-        errors = "strict"
-        state = space.fromcache(CodecState)
-        func = getattr(runicode, rname)
-        kwargs = {}
-        if name == 'unicode_escape':
-            unicodedata_handler = state.get_unicodedata_handler(space)
-            result, consumed = func(string, len(string), errors,
-                                    final, state.decode_error_handler,
-                                    unicodedata_handler=unicodedata_handler)
-        else:
-            result, consumed = func(string, len(string), errors,
-                                    final, state.decode_error_handler)
-        return result
-    raw_decoder.func_name = rname
-    return raw_decoder
-
 def make_encoder_wrapper(name):
     rname = "unicode_encode_%s" % (name.replace("_encode", ""), )
     assert hasattr(runicode, rname)
diff --git a/pypy/module/marshal/test/test_marshal.py b/pypy/module/marshal/test/test_marshal.py
--- a/pypy/module/marshal/test/test_marshal.py
+++ b/pypy/module/marshal/test/test_marshal.py
@@ -163,6 +163,7 @@
     def test_unicode(self):
         import marshal, sys
         self.marshal_check(u'\uFFFF')
+        self.marshal_check(u'\ud800')
 
         self.marshal_check(unichr(sys.maxunicode))
 
diff --git a/pypy/objspace/std/unicodetype.py b/pypy/objspace/std/unicodetype.py
--- a/pypy/objspace/std/unicodetype.py
+++ b/pypy/objspace/std/unicodetype.py
@@ -1,5 +1,5 @@
 from pypy.interpreter.error import OperationError, operationerrfmt
-from pypy.interpreter import gateway
+from pypy.interpreter import gateway, unicodehelper
 from pypy.objspace.std.stdtypedef import StdTypeDef, SMM
 from pypy.objspace.std.register_all import register_all
 from pypy.objspace.std.basestringtype import basestring_typedef
@@ -186,32 +186,6 @@
 
 # ____________________________________________________________
 
-def decode_error_handler(space):
-    def raise_unicode_exception_decode(errors, encoding, msg, s,
-                                       startingpos, endingpos):
-        raise OperationError(space.w_UnicodeDecodeError,
-                             space.newtuple([space.wrap(encoding),
-                                             space.wrap(s),
-                                             space.wrap(startingpos),
-                                             space.wrap(endingpos),
-                                             space.wrap(msg)]))
-    return raise_unicode_exception_decode
-decode_error_handler._annspecialcase_ = 'specialize:memo'
-
-def encode_error_handler(space):
-    def raise_unicode_exception_encode(errors, encoding, msg, u,
-                                       startingpos, endingpos):
-        raise OperationError(space.w_UnicodeEncodeError,
-                             space.newtuple([space.wrap(encoding),
-                                             space.wrap(u),
-                                             space.wrap(startingpos),
-                                             space.wrap(endingpos),
-                                             space.wrap(msg)]))
-    return raise_unicode_exception_encode
-encode_error_handler._annspecialcase_ = 'specialize:memo'
-
-# ____________________________________________________________
-
 def getdefaultencoding(space):
     return space.sys.defaultencoding
 
@@ -235,12 +209,12 @@
         if errors is None or errors == 'strict':
             if encoding == 'ascii':
                 u = space.unicode_w(w_object)
-                eh = encode_error_handler(space)
+                eh = unicodehelper.encode_error_handler(space)
                 return space.wrap(unicode_encode_ascii(
                         u, len(u), None, errorhandler=eh))
             if encoding == 'utf-8':
                 u = space.unicode_w(w_object)
-                eh = encode_error_handler(space)
+                eh = unicodehelper.encode_error_handler(space)
                 return space.wrap(unicode_encode_utf_8(
                         u, len(u), None, errorhandler=eh,
                         allow_surrogates=True))
@@ -265,12 +239,12 @@
         if encoding == 'ascii':
             # XXX error handling
             s = space.bufferstr_w(w_obj)
-            eh = decode_error_handler(space)
+            eh = unicodehelper.decode_error_handler(space)
             return space.wrap(str_decode_ascii(
                     s, len(s), None, final=True, errorhandler=eh)[0])
         if encoding == 'utf-8':
             s = space.bufferstr_w(w_obj)
-            eh = decode_error_handler(space)
+            eh = unicodehelper.decode_error_handler(space)
             return space.wrap(str_decode_utf_8(
                     s, len(s), None, final=True, errorhandler=eh,
                     allow_surrogates=True)[0])