[pypy-commit] pypy cffi-char16-char32: in-progress
arigo
pypy.commits at gmail.com
Sun Jun 4 03:59:01 EDT 2017
Author: Armin Rigo <arigo at tunes.org>
Branch: cffi-char16-char32
Changeset: r91504:8bc39f008ba8
Date: 2017-06-04 09:58 +0200
http://bitbucket.org/pypy/pypy/changeset/8bc39f008ba8/
Log: in-progress
diff --git a/pypy/module/_cffi_backend/ctypearray.py b/pypy/module/_cffi_backend/ctypearray.py
--- a/pypy/module/_cffi_backend/ctypearray.py
+++ b/pypy/module/_cffi_backend/ctypearray.py
@@ -36,8 +36,7 @@
datasize = self.size
#
if datasize < 0:
- from pypy.module._cffi_backend import misc
- w_init, length = misc.get_new_array_length(space, w_init)
+ w_init, length = self.get_new_array_length(w_init)
try:
datasize = ovfcheck(length * self.ctitem.size)
except OverflowError:
@@ -53,6 +52,29 @@
self.convert_from_object(ptr, w_init)
return cdata
+ def get_new_array_length(self, w_value):
+ space = self.space
+ if (space.isinstance_w(w_value, space.w_list) or
+ space.isinstance_w(w_value, space.w_tuple)):
+ return (w_value, space.int_w(space.len(w_value)))
+ elif space.isinstance_w(w_value, space.w_bytes):
+ # from a string, we add the null terminator
+ s = space.bytes_w(w_value)
+ return (w_value, len(s) + 1)
+ elif space.isinstance_w(w_value, space.w_unicode):
+ from pypy.module._cffi_backend import wchar_helper
+ u = space.unicode_w(w_value)
+ if self.ctitem.size == 2:
+ length = wchar_helper.unicode_size_as_char16(u)
+ else:
+ length = wchar_helper.unicode_size_as_char32(u)
+ return (w_value, length + 1)
+ else:
+ explicitlength = space.getindex_w(w_value, space.w_OverflowError)
+ if explicitlength < 0:
+ raise oefmt(space.w_ValueError, "negative array length")
+ return (space.w_None, explicitlength)
+
def _check_subscript_index(self, w_cdata, i):
space = self.space
if i < 0:
diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py
--- a/pypy/module/_cffi_backend/ctypeprim.py
+++ b/pypy/module/_cffi_backend/ctypeprim.py
@@ -42,12 +42,13 @@
def cast_unicode(self, w_ob):
space = self.space
s = space.unicode_w(w_ob)
- XXXXXXXXXXXXXX
- if len(s) != 1:
+ try:
+ ordinal = wchar_helper.unicode_to_ordinal(s)
+ except ValueError:
raise oefmt(space.w_TypeError,
"cannot cast unicode string of length %d to ctype '%s'",
len(s), self.name)
- return ord(s[0])
+ return intmask(ordinal)
def cast(self, w_ob):
from pypy.module._cffi_backend import ctypeptr
diff --git a/pypy/module/_cffi_backend/ctypeptr.py b/pypy/module/_cffi_backend/ctypeptr.py
--- a/pypy/module/_cffi_backend/ctypeptr.py
+++ b/pypy/module/_cffi_backend/ctypeptr.py
@@ -4,9 +4,9 @@
from rpython.rlib import rposix
from rpython.rlib.rarithmetic import ovfcheck
-from rpython.rtyper.annlowlevel import llstr, llunicode
+from rpython.rtyper.annlowlevel import llstr
from rpython.rtyper.lltypesystem import lltype, rffi
-from rpython.rtyper.lltypesystem.rstr import copy_string_to_raw, copy_unicode_to_raw
+from rpython.rtyper.lltypesystem.rstr import copy_string_to_raw
from pypy.interpreter.error import OperationError, oefmt, wrap_oserror
from pypy.module._cffi_backend import cdataobj, misc, ctypeprim, ctypevoid
@@ -88,31 +88,23 @@
if n != self.length:
cdata[n] = '\x00'
elif isinstance(self.ctitem, ctypeprim.W_CTypePrimitiveUniChar):
+ from pypy.module._cffi_backend import wchar_helper
if not space.isinstance_w(w_ob, space.w_unicode):
raise self._convert_error("unicode or list or tuple", w_ob)
s = space.unicode_w(w_ob)
- XXXXXXXXXXXXXXX
- n = len(s)
+ if self.ctitem.size == 2:
+ n = wchar_helper.unicode_size_as_char16(s)
+ else:
+ n = wchar_helper.unicode_size_as_char32(s)
if self.length >= 0 and n > self.length:
raise oefmt(space.w_IndexError,
"initializer unicode string is too long for '%s' "
"(got %d characters)", self.name, n)
-
-
-
-
+ add_final_zero = (n != self.length)
if self.ctitem.size == 2:
- length = wchar_helper.measure_length_16(ptr, length)
+ wchar_helper.unicode_to_char16(s, cdata, n, add_final_zero)
else:
- length = wchar_helper.measure_length_32(ptr, length)
- XXXX
-
-
-
- unichardata = rffi.cast(rffi.CWCHARP, cdata)
- copy_unicode_to_raw(llunicode(s), unichardata, 0, n)
- if n != self.length:
- unichardata[n] = u'\x00'
+ wchar_helper.unicode_to_char32(s, cdata, n, add_final_zero)
else:
raise self._convert_error("list or tuple", w_ob)
@@ -315,10 +307,18 @@
if (space.isinstance_w(w_init, space.w_list) or
space.isinstance_w(w_init, space.w_tuple)):
length = space.int_w(space.len(w_init))
- elif space.isinstance_w(w_init, space.w_basestring):
+ elif space.isinstance_w(w_init, space.w_bytes):
# from a string, we add the null terminator
- XXXXXXXXXXXXXXX
- length = space.int_w(space.len(w_init)) + 1
+ s = space.bytes_w(w_init)
+ length = len(s) + 1
+ elif space.isinstance_w(w_init, space.w_unicode):
+ from pypy.module._cffi_backend import wchar_helper
+ u = space.unicode_w(w_init)
+ if self.ctitem.size == 2:
+ length = wchar_helper.unicode_size_as_char16(u)
+ else:
+ length = wchar_helper.unicode_size_as_char32(u)
+ length += 1
elif self.is_file:
result = self.prepare_file(w_init)
if result:
diff --git a/pypy/module/_cffi_backend/ctypestruct.py b/pypy/module/_cffi_backend/ctypestruct.py
--- a/pypy/module/_cffi_backend/ctypestruct.py
+++ b/pypy/module/_cffi_backend/ctypestruct.py
@@ -244,7 +244,7 @@
ct = self.ctype
if isinstance(ct, ctypearray.W_CTypeArray) and ct.length < 0:
space = ct.space
- w_ob, varsizelength = misc.get_new_array_length(space, w_ob)
+ w_ob, varsizelength = ct.get_new_array_length(w_ob)
if optvarsize != -1:
# in this mode, the only purpose of this function is to compute
# the real size of the structure from a var-sized C99 array
diff --git a/pypy/module/_cffi_backend/misc.py b/pypy/module/_cffi_backend/misc.py
--- a/pypy/module/_cffi_backend/misc.py
+++ b/pypy/module/_cffi_backend/misc.py
@@ -290,22 +290,6 @@
# ____________________________________________________________
-def get_new_array_length(space, w_value):
- if (space.isinstance_w(w_value, space.w_list) or
- space.isinstance_w(w_value, space.w_tuple)):
- return (w_value, space.int_w(space.len(w_value)))
- elif space.isinstance_w(w_value, space.w_basestring):
- # from a string, we add the null terminator
- XXXXXXXXXX
- return (w_value, space.int_w(space.len(w_value)) + 1)
- else:
- explicitlength = space.getindex_w(w_value, space.w_OverflowError)
- if explicitlength < 0:
- raise oefmt(space.w_ValueError, "negative array length")
- return (space.w_None, explicitlength)
-
-# ____________________________________________________________
-
@specialize.arg(0)
def _raw_memcopy_tp(TPP, source, dest):
# in its own function: LONGLONG may make the whole function jit-opaque
diff --git a/pypy/module/_cffi_backend/wchar_helper.py b/pypy/module/_cffi_backend/wchar_helper.py
--- a/pypy/module/_cffi_backend/wchar_helper.py
+++ b/pypy/module/_cffi_backend/wchar_helper.py
@@ -1,6 +1,8 @@
from rpython.rlib.objectmodel import specialize
from rpython.rlib.rarithmetic import r_uint, r_ulonglong, intmask
+from rpython.rtyper.annlowlevel import llunicode
from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.rtyper.lltypesystem.rstr import copy_unicode_to_raw
SIZE_UNICODE = rffi.sizeof(lltype.UniChar)
@@ -18,8 +20,7 @@
unichr(0xDC00 | (ordinal & 0x3FF)))
def is_surrogate(u, index):
- return (index + 1 < len(u) and
- unichr(0xD800) <= u[index + 0] <= unichr(0xDBFF) and
+ return (unichr(0xD800) <= u[index + 0] <= unichr(0xDBFF) and
unichr(0xDC00) <= u[index + 1] <= unichr(0xDFFF))
def as_surrogate(u, index):
@@ -42,9 +43,13 @@
ordinal = intmask(rffi.cast(rffi.INT, ordinal))
self.ordinal = ordinal
+def _unicode_from_wchar(ptr, length):
+ return rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length)
+
if SIZE_UNICODE == 2:
def unicode_from_char32(ptr, length):
+ # 'ptr' is a pointer to 'length' 32-bit integers
ptr = rffi.cast(rffi.UINTP, ptr)
alloc = length
for i in range(length):
@@ -69,14 +74,13 @@
assert j == len(u)
return u''.join(u)
- def unicode_from_char16(ptr, length):
- return rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length)
+ unicode_from_char16 = _unicode_from_wchar
else:
- def unicode_from_char32(ptr, length):
- return rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length)
+ unicode_from_char32 = _unicode_from_wchar
def unicode_from_char16(ptr, length):
+ # 'ptr' is a pointer to 'length' 16-bit integers
ptr = rffi.cast(rffi.USHORTP, ptr)
u = [u'\x00'] * length
i = 0
@@ -113,5 +117,71 @@
return _measure_length(rffi.cast(rffi.UINTP, ptr), maxlen)
-def unicode_to_char16(u, ptr):
- XXX
+def unicode_size_as_char16(u):
+ result = len(u)
+ if SIZE_UNICODE == 4:
+ for i in range(result):
+ if ord(u[i]) > 0xFFFF:
+ result += 1
+ return result
+
+def unicode_size_as_char32(u):
+ result = len(u)
+ if SIZE_UNICODE == 2 and result > 1:
+ for i in range(result - 1):
+ if is_surrogate(u, i):
+ result -= 1
+ return result
+
+
+def _unicode_to_wchar(u, target_ptr, target_length, add_final_zero):
+ # 'target_ptr' is a raw pointer to 'target_length' wchars;
+ # we assume here that target_length == len(u).
+ unichardata = rffi.cast(rffi.CWCHARP, target_ptr)
+ copy_unicode_to_raw(llunicode(u), unichardata, 0, target_length)
+ if add_final_zero:
+ unichardata[target_length] = u'\x00'
+
+
+if SIZE_UNICODE == 2:
+ def unicode_to_char32(u, target_ptr, target_length, add_final_zero):
+ # 'target_ptr' is a raw pointer to 'target_length' 32-bit integers;
+ # we assume here that target_length == unicode_size_as_char32(u).
+ ptr = rffi.cast(rffi.UINTP, target_ptr)
+ src_index = 0
+ for i in range(target_length):
+ if i < target_length - 1 and is_surrogate(u, src_index):
+ ordinal = as_surrogate(u, src_index)
+ src_index += 2
+ else:
+ ordinal = r_uint(ord(u[src_index]))
+ src_index += 1
+ ptr[i] = rffi.cast(rffi.UINT, ordinal)
+ if add_final_zero:
+ ptr[target_length] = rffi.cast(rffi.UINT, 0)
+
+ unicode_to_char16 = _unicode_to_wchar
+
+else:
+ unicode_to_char32 = _unicode_to_wchar
+
+ def unicode_to_char16(u, target_ptr, target_length, add_final_zero):
+ # 'target_ptr' is a raw pointer to 'target_length' 16-bit integers;
+ # we assume here that target_length == unicode_size_as_char16(u).
+ ptr = rffi.cast(rffi.USHORTP, target_ptr)
+ for uc in u:
+ ordinal = ord(uc)
+ if ordinal > 0xFFFF:
+ # NB. like CPython, ignore the problem of unicode string
+ # objects containing characters greater than sys.maxunicode
+ ordinal -= 0x10000
+ ptr[0] = rffi.cast(rffi.USHORT, 0xD800 | (ordinal >> 10))
+ ptr[1] = rffi.cast(rffi.USHORT, 0xDC00 | (ordinal & 0x3FF))
+ ptr = rffi.ptradd(ptr, 2)
+ else:
+ ptr[0] = rffi.cast(rffi.USHORT, ordinal)
+ ptr = rffi.ptradd(ptr, 1)
+ assert ptr == (
+ rffi.ptradd(rffi.cast(rffi.USHORTP, target_ptr), target_length))
+ if add_final_zero:
+ ptr[0] = rffi.cast(rffi.USHORT, 0)
More information about the pypy-commit
mailing list