[pypy-commit] pypy unicode-utf8: Some unicode>utf8 conversions in cpyext/unicodeobject.py
rlamy
pypy.commits at gmail.com
Fri Dec 8 08:08:02 EST 2017
Author: Ronan Lamy <ronan.lamy at gmail.com>
Branch: unicode-utf8
Changeset: r93316:8cc0253e1ece
Date: 2017-12-08 13:07 +0000
http://bitbucket.org/pypy/pypy/changeset/8cc0253e1ece/
Log: Some unicode>utf8 conversions in cpyext/unicodeobject.py
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,10 +1,11 @@
import sys
-from pypy.interpreter.error import OperationError
+from pypy.interpreter.error import OperationError, oefmt
from rpython.rlib.objectmodel import specialize
from rpython.rlib import rutf8
from rpython.rlib.rarithmetic import r_uint, intmask
from rpython.rlib.rstring import StringBuilder
+from rpython.rtyper.lltypesystem import rffi
from pypy.module._codecs import interp_codecs
@specialize.memo()
@@ -204,7 +205,7 @@
if c > 0x7F:
errorhandler("strict", 'ascii',
'ordinal not in range(128)', utf8,
- pos, pos + 1)
+ pos, pos + 1)
j = rutf8.next_codepoint_pos(r, j)
pos = newpos
res.append(r)
@@ -530,6 +531,19 @@
return builder.build(), pos, outsize
+def wcharpsize2utf8(space, wcharp, size):
+ """Safe version of rffi.wcharpsize2utf8.
+
+ Raises app-level ValueError if any wchar value is outside the valid
+ codepoint range.
+ """
+ try:
+ return rffi.wcharpsize2utf8(wcharp, size)
+ except ValueError:
+ raise oefmt(space.w_ValueError,
+ "character is not in range [U+0000; U+10ffff]")
+
+
# ____________________________________________________________
# Raw unicode escape
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -1,5 +1,9 @@
+from rpython.rtyper.lltypesystem import rffi, lltype
+from rpython.rlib import rstring, runicode
+from rpython.tool.sourcetools import func_renamer
+
from pypy.interpreter.error import OperationError, oefmt
-from rpython.rtyper.lltypesystem import rffi, lltype
+from pypy.interpreter.unicodehelper import wcharpsize2utf8
from pypy.module.unicodedata import unicodedb
from pypy.module.cpyext.api import (
CANNOT_FAIL, Py_ssize_t, build_type_checkers_flags, cpython_api,
@@ -13,8 +17,6 @@
from pypy.module.sys.interp_encoding import setdefaultencoding
from pypy.module._codecs.interp_codecs import CodecState
from pypy.objspace.std import unicodeobject
-from rpython.rlib import rstring, runicode
-from rpython.tool.sourcetools import func_renamer
import sys
## See comment in bytesobject.py.
@@ -61,10 +63,10 @@
def unicode_attach(space, py_obj, w_obj, w_userdata=None):
"Fills a newly allocated PyUnicodeObject with a unicode string"
py_unicode = rffi.cast(PyUnicodeObject, py_obj)
- s = space.unicode_w(w_obj)
- py_unicode.c_length = len(s)
+ s, length = space.utf8_len_w(w_obj)
+ py_unicode.c_length = length
py_unicode.c_str = lltype.nullptr(rffi.CWCHARP.TO)
- py_unicode.c_hash = space.hash_w(space.newunicode(s))
+ py_unicode.c_hash = space.hash_w(space.newutf8(s, length))
py_unicode.c_defenc = lltype.nullptr(PyObject.TO)
def unicode_realize(space, py_obj):
@@ -73,11 +75,12 @@
be modified after this call.
"""
py_uni = rffi.cast(PyUnicodeObject, py_obj)
- s = rffi.wcharpsize2unicode(py_uni.c_str, py_uni.c_length)
+ length = py_uni.c_length
+ s = wcharpsize2utf8(space, py_uni.c_str, length)
w_type = from_ref(space, rffi.cast(PyObject, py_obj.c_ob_type))
w_obj = space.allocate_instance(unicodeobject.W_UnicodeObject, w_type)
- w_obj.__init__(s)
- py_uni.c_hash = space.hash_w(space.newunicode(s))
+ w_obj.__init__(s, length)
+ py_uni.c_hash = space.hash_w(space.newutf8(s, length))
track_reference(space, py_obj, w_obj)
return w_obj
@@ -214,8 +217,8 @@
if not ref_unicode.c_str:
# Copy unicode buffer
w_unicode = from_ref(space, rffi.cast(PyObject, ref))
- u = space.unicode_w(w_unicode)
- ref_unicode.c_str = rffi.unicode2wcharp(u)
+ u, length = space.utf8_len_w(w_unicode)
+ ref_unicode.c_str = rffi.utf82wcharp(u, length)
return ref_unicode.c_str
@cpython_api([PyObject], rffi.CWCHARP)
@@ -335,8 +338,8 @@
Therefore, modification of the resulting Unicode object is only allowed when u
is NULL."""
if wchar_p:
- s = rffi.wcharpsize2unicode(wchar_p, length)
- return make_ref(space, space.newunicode(s))
+ s = wcharpsize2utf8(space, wchar_p, length)
+ return make_ref(space, space.newutf8(s, length))
else:
return rffi.cast(PyObject, new_empty_unicode(space, length))
@@ -506,7 +509,8 @@
"""Encode the Py_UNICODE buffer of the given size and return a
Python string object. Return NULL if an exception was raised
by the codec."""
- w_u = space.newunicode(rffi.wcharpsize2unicode(s, size))
+ u = wcharpsize2utf8(space, s, size)
+ w_u = space.newutf8(u, size)
if errors:
w_errors = space.newtext(rffi.charp2str(errors))
else:
@@ -706,12 +710,12 @@
"""Return 1 if substr matches str[start:end] at the given tail end
(direction == -1 means to do a prefix match, direction == 1 a
suffix match), 0 otherwise. Return -1 if an error occurred."""
- str = space.unicode_w(w_str)
- substr = space.unicode_w(w_substr)
+ w_start = space.newint(start)
+ w_end = space.newint(end)
if rffi.cast(lltype.Signed, direction) <= 0:
- return rstring.startswith(str, substr, start, end)
+ return space.call_method(w_str, "startswith", w_substr, w_start, w_end)
else:
- return rstring.endswith(str, substr, start, end)
+ return space.call_method(w_str, "endswith", w_substr, w_start, w_end)
@cpython_api([PyObject, PyObject, Py_ssize_t, Py_ssize_t], Py_ssize_t, error=-1)
def PyUnicode_Count(space, w_str, w_substr, start, end):
More information about the pypy-commit
mailing list