[pypy-commit] pypy default: Issue1285: Python2 allows lone surrogates, also in string literals which appear in marshalled code.
amauryfa
noreply at buildbot.pypy.org
Thu Oct 11 00:53:38 CEST 2012
Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch:
Changeset: r57996:ce4e0ff9862b
Date: 2012-10-11 00:51 +0200
http://bitbucket.org/pypy/pypy/changeset/ce4e0ff9862b/
Log: Issue1285: Python2 allows lone surrogates, also in string literals
which appear in marshalled code.
Also use more direct code for functions that are often used.
diff --git a/pypy/interpreter/generator.py b/pypy/interpreter/generator.py
--- a/pypy/interpreter/generator.py
+++ b/pypy/interpreter/generator.py
@@ -3,7 +3,6 @@
from pypy.interpreter.gateway import NoneNotWrapped
from pypy.interpreter.pyopcode import LoopBlock
from pypy.rlib import jit
-from pypy.rlib.objectmodel import specialize
class GeneratorIterator(Wrappable):
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,10 +1,62 @@
+from pypy.interpreter.error import OperationError
+from pypy.rlib.objectmodel import specialize
+from pypy.rlib import runicode
from pypy.module._codecs import interp_codecs
+@specialize.memo()
+def decode_error_handler(space):
+ def raise_unicode_exception_decode(errors, encoding, msg, s,
+ startingpos, endingpos):
+ raise OperationError(space.w_UnicodeDecodeError,
+ space.newtuple([space.wrap(encoding),
+ space.wrap(s),
+ space.wrap(startingpos),
+ space.wrap(endingpos),
+ space.wrap(msg)]))
+ return raise_unicode_exception_decode
+
+@specialize.memo()
+def encode_error_handler(space):
+ def raise_unicode_exception_encode(errors, encoding, msg, u,
+ startingpos, endingpos):
+ raise OperationError(space.w_UnicodeEncodeError,
+ space.newtuple([space.wrap(encoding),
+ space.wrap(u),
+ space.wrap(startingpos),
+ space.wrap(endingpos),
+ space.wrap(msg)]))
+ return raise_unicode_exception_encode
+
+# ____________________________________________________________
+
def PyUnicode_AsEncodedString(space, w_data, w_encoding):
return interp_codecs.encode(space, w_data, w_encoding)
# These functions take and return unwrapped rpython strings and unicodes
-PyUnicode_DecodeUnicodeEscape = interp_codecs.make_raw_decoder('unicode_escape')
-PyUnicode_DecodeRawUnicodeEscape = interp_codecs.make_raw_decoder('raw_unicode_escape')
-PyUnicode_DecodeUTF8 = interp_codecs.make_raw_decoder('utf_8')
-PyUnicode_EncodeUTF8 = interp_codecs.make_raw_encoder('utf_8')
+def PyUnicode_DecodeUnicodeEscape(space, string):
+ state = space.fromcache(interp_codecs.CodecState)
+ unicodedata_handler = state.get_unicodedata_handler(space)
+ result, consumed = runicode.str_decode_unicode_escape(
+ string, len(string), "strict",
+ final=True, errorhandler=decode_error_handler(space),
+ unicodedata_handler=unicodedata_handler)
+ return result
+
+def PyUnicode_DecodeRawUnicodeEscape(space, string):
+ result, consumed = runicode.str_decode_raw_unicode_escape(
+ string, len(string), "strict",
+ final=True, errorhandler=decode_error_handler(space))
+ return result
+
+def PyUnicode_DecodeUTF8(space, string):
+ result, consumed = runicode.str_decode_utf_8(
+ string, len(string), "strict",
+ final=True, errorhandler=decode_error_handler(space),
+ allow_surrogates=True)
+ return result
+
+def PyUnicode_EncodeUTF8(space, uni):
+ return runicode.unicode_encode_utf_8(
+ uni, len(uni), "strict",
+ errorhandler=encode_error_handler(space),
+ allow_surrogates=True)
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -339,38 +339,6 @@
from pypy.rlib import runicode
-def make_raw_encoder(name):
- rname = "unicode_encode_%s" % (name.replace("_encode", ""), )
- assert hasattr(runicode, rname)
- def raw_encoder(space, uni):
- state = space.fromcache(CodecState)
- func = getattr(runicode, rname)
- errors = "strict"
- return func(uni, len(uni), errors, state.encode_error_handler)
- raw_encoder.func_name = rname
- return raw_encoder
-
-def make_raw_decoder(name):
- rname = "str_decode_%s" % (name.replace("_decode", ""), )
- assert hasattr(runicode, rname)
- def raw_decoder(space, string):
- final = True
- errors = "strict"
- state = space.fromcache(CodecState)
- func = getattr(runicode, rname)
- kwargs = {}
- if name == 'unicode_escape':
- unicodedata_handler = state.get_unicodedata_handler(space)
- result, consumed = func(string, len(string), errors,
- final, state.decode_error_handler,
- unicodedata_handler=unicodedata_handler)
- else:
- result, consumed = func(string, len(string), errors,
- final, state.decode_error_handler)
- return result
- raw_decoder.func_name = rname
- return raw_decoder
-
def make_encoder_wrapper(name):
rname = "unicode_encode_%s" % (name.replace("_encode", ""), )
assert hasattr(runicode, rname)
diff --git a/pypy/module/marshal/test/test_marshal.py b/pypy/module/marshal/test/test_marshal.py
--- a/pypy/module/marshal/test/test_marshal.py
+++ b/pypy/module/marshal/test/test_marshal.py
@@ -163,6 +163,7 @@
def test_unicode(self):
import marshal, sys
self.marshal_check(u'\uFFFF')
+ self.marshal_check(u'\ud800')
self.marshal_check(unichr(sys.maxunicode))
diff --git a/pypy/objspace/std/unicodetype.py b/pypy/objspace/std/unicodetype.py
--- a/pypy/objspace/std/unicodetype.py
+++ b/pypy/objspace/std/unicodetype.py
@@ -1,5 +1,5 @@
from pypy.interpreter.error import OperationError, operationerrfmt
-from pypy.interpreter import gateway
+from pypy.interpreter import gateway, unicodehelper
from pypy.objspace.std.stdtypedef import StdTypeDef, SMM
from pypy.objspace.std.register_all import register_all
from pypy.objspace.std.basestringtype import basestring_typedef
@@ -186,32 +186,6 @@
# ____________________________________________________________
-def decode_error_handler(space):
- def raise_unicode_exception_decode(errors, encoding, msg, s,
- startingpos, endingpos):
- raise OperationError(space.w_UnicodeDecodeError,
- space.newtuple([space.wrap(encoding),
- space.wrap(s),
- space.wrap(startingpos),
- space.wrap(endingpos),
- space.wrap(msg)]))
- return raise_unicode_exception_decode
-decode_error_handler._annspecialcase_ = 'specialize:memo'
-
-def encode_error_handler(space):
- def raise_unicode_exception_encode(errors, encoding, msg, u,
- startingpos, endingpos):
- raise OperationError(space.w_UnicodeEncodeError,
- space.newtuple([space.wrap(encoding),
- space.wrap(u),
- space.wrap(startingpos),
- space.wrap(endingpos),
- space.wrap(msg)]))
- return raise_unicode_exception_encode
-encode_error_handler._annspecialcase_ = 'specialize:memo'
-
-# ____________________________________________________________
-
def getdefaultencoding(space):
return space.sys.defaultencoding
@@ -235,12 +209,12 @@
if errors is None or errors == 'strict':
if encoding == 'ascii':
u = space.unicode_w(w_object)
- eh = encode_error_handler(space)
+ eh = unicodehelper.encode_error_handler(space)
return space.wrap(unicode_encode_ascii(
u, len(u), None, errorhandler=eh))
if encoding == 'utf-8':
u = space.unicode_w(w_object)
- eh = encode_error_handler(space)
+ eh = unicodehelper.encode_error_handler(space)
return space.wrap(unicode_encode_utf_8(
u, len(u), None, errorhandler=eh,
allow_surrogates=True))
@@ -265,12 +239,12 @@
if encoding == 'ascii':
# XXX error handling
s = space.bufferstr_w(w_obj)
- eh = decode_error_handler(space)
+ eh = unicodehelper.decode_error_handler(space)
return space.wrap(str_decode_ascii(
s, len(s), None, final=True, errorhandler=eh)[0])
if encoding == 'utf-8':
s = space.bufferstr_w(w_obj)
- eh = decode_error_handler(space)
+ eh = unicodehelper.decode_error_handler(space)
return space.wrap(str_decode_utf_8(
s, len(s), None, final=True, errorhandler=eh,
allow_surrogates=True)[0])
More information about the pypy-commit
mailing list