[pypy-commit] pypy unicode-utf8: Fixes for _cffi_backend
arigo
pypy.commits at gmail.com
Thu Nov 23 09:41:28 EST 2017
Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r93137:a94b5860dbb3
Date: 2017-11-23 15:40 +0100
http://bitbucket.org/pypy/pypy/changeset/a94b5860dbb3/
Log: Fixes for _cffi_backend
diff --git a/pypy/module/_cffi_backend/ctypearray.py b/pypy/module/_cffi_backend/ctypearray.py
--- a/pypy/module/_cffi_backend/ctypearray.py
+++ b/pypy/module/_cffi_backend/ctypearray.py
@@ -64,13 +64,10 @@
elif space.isinstance_w(w_value, space.w_unicode):
from pypy.module._cffi_backend import wchar_helper
w_u = space.convert_arg_to_w_unicode(w_value)
- if self.citem.size == 4:
+ if self.ctitem.size == 2:
+ length = wchar_helper.utf8_size_as_char16(w_u._utf8)
+ else:
length = w_u._len()
- else:
- if not w_u._has_surrogates():
- length = w_u._len()
- else:
- length = wchar_helper.unicode_size_as_char16(w_u._utf8, w_u._len())
return (w_value, length + 1)
else:
explicitlength = space.getindex_w(w_value, space.w_OverflowError)
diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py
--- a/pypy/module/_cffi_backend/ctypeprim.py
+++ b/pypy/module/_cffi_backend/ctypeprim.py
@@ -40,16 +40,13 @@
return ord(s[0])
def cast_unicode(self, w_ob):
- import pdb
- pdb.set_trace()
space = self.space
w_u = space.convert_arg_to_w_unicode(w_ob)
if w_u._len() != 1:
raise oefmt(space.w_TypeError,
"cannot cast unicode string of length %d to ctype '%s'",
w_u._len(), self.name)
- ordinal = rutf8.codepoint_at_pos(w_u._utf8, 0)
- return intmask(ordinal)
+ return rutf8.codepoint_at_pos(w_u._utf8, 0)
def cast(self, w_ob):
from pypy.module._cffi_backend import ctypeptr
@@ -175,21 +172,19 @@
return self.space.newint(value) # r_uint => 'long' object
def convert_to_object(self, cdata):
- if self.is_signed_wchar:
- code = ord(rffi.cast(rffi.CWCHARP, cdata)[0])
- return self.space.newutf8(
- rutf8.unichr_as_utf8(code), 1,
- rutf8.get_flag_from_code(code))
- else:
- value = misc.read_raw_ulong_data(cdata, self.size) # r_uint
- try:
- u = wchar_helper.ordinal_to_unicode(value)
- except wchar_helper.OutOfRange as e:
- raise oefmt(self.space.w_ValueError,
- "char32_t out of range for "
- "conversion to unicode: %s", hex(e.ordinal))
- return self.space.newutf8(rutf8.unichr_as_utf8(ord(u)), 1,
- rutf8.get_flag_from_code(ord(u)))
+ value = misc.read_raw_ulong_data(cdata, self.size) # r_uint
+ try:
+ utf8 = rutf8.unichr_as_utf8(value, allow_surrogates=True)
+ except ValueError:
+ if self.is_signed_wchar:
+ s = hex(intmask(value))
+ else:
+ s = hex(value)
+ raise oefmt(self.space.w_ValueError,
+ "%s out of range for conversion to unicode: %s",
+ self.name, s)
+ flag = rutf8.get_flag_from_code(intmask(value))
+ return self.space.newutf8(utf8, 1, flag)
def string(self, cdataobj, maxlen):
with cdataobj as ptr:
@@ -200,7 +195,13 @@
# returns a r_uint. If self.size == 2, it is smaller than 0x10000
space = self.space
if space.isinstance_w(w_ob, space.w_unicode):
- return rutf8.codepoint_at_pos(space.utf8_w(w_ob), 0)
+ w_u = space.convert_arg_to_w_unicode(w_ob)
+ if w_u._len() != 1:
+ raise self._convert_error("single character", w_ob)
+ ordinal = rutf8.codepoint_at_pos(w_u._utf8, 0)
+ if self.size == 2 and ordinal > 0xFFFF:
+ raise self._convert_error("single character <= 0xFFFF", w_ob)
+ return r_uint(ordinal)
elif (isinstance(w_ob, cdataobj.W_CData) and
isinstance(w_ob.ctype, W_CTypePrimitiveUniChar) and
w_ob.ctype.size == self.size):
@@ -214,15 +215,15 @@
def unpack_ptr(self, w_ctypeptr, ptr, length):
if self.size == 2:
- u = wchar_helper.unicode_from_char16(ptr, length)
+ utf8, lgt, flag = wchar_helper.utf8_from_char16(ptr, length)
else:
try:
- u = wchar_helper.unicode_from_char32(ptr, length)
+ utf8, lgt, flag = wchar_helper.utf8_from_char32(ptr, length)
except wchar_helper.OutOfRange as e:
raise oefmt(self.space.w_ValueError,
- "char32_t out of range for "
- "conversion to unicode: %s", hex(e.ordinal))
- return self.space.newunicode(u)
+ "%s out of range for conversion to unicode: %s",
+ self.name, hex(e.ordinal))
+ return self.space.newutf8(utf8, lgt, flag)
class W_CTypePrimitiveSigned(W_CTypePrimitive):
diff --git a/pypy/module/_cffi_backend/ctypeptr.py b/pypy/module/_cffi_backend/ctypeptr.py
--- a/pypy/module/_cffi_backend/ctypeptr.py
+++ b/pypy/module/_cffi_backend/ctypeptr.py
@@ -92,28 +92,20 @@
if not space.isinstance_w(w_ob, space.w_unicode):
raise self._convert_error("unicode or list or tuple", w_ob)
w_u = space.convert_arg_to_w_unicode(w_ob)
- if self.size == 4:
+ s = w_u._utf8
+ if self.ctitem.size == 2:
+ n = wchar_helper.utf8_size_as_char16(s)
+ else:
n = w_u._len()
- else:
- if not w_u._has_surrogates():
- n = w_u._len()
- else:
- n = wchar_helper.unicode_size_as_char16(w_u._utf8,
- w_u._len())
if self.length >= 0 and n > self.length:
raise oefmt(space.w_IndexError,
"initializer unicode string is too long for '%s' "
"(got %d characters)", self.name, n)
add_final_zero = (n != self.length)
if self.ctitem.size == 2:
- try:
- wchar_helper.unicode_to_char16(s, cdata, n, add_final_zero)
- except wchar_helper.OutOfRange as e:
- raise oefmt(self.space.w_ValueError,
- "unicode character ouf of range for "
- "conversion to char16_t: %s", hex(e.ordinal))
+ wchar_helper.utf8_to_char16(s, cdata, n, add_final_zero)
else:
- wchar_helper.unicode_to_char32(s, cdata, n, add_final_zero)
+ wchar_helper.utf8_to_char32(s, cdata, n, add_final_zero)
else:
raise self._convert_error("list or tuple", w_ob)
@@ -334,8 +326,7 @@
from pypy.module._cffi_backend import wchar_helper
w_u = space.convert_arg_to_w_unicode(w_init)
if self.ctitem.size == 2:
- length = wchar_helper.unicode_size_as_char16(w_u._utf8,
- w_u._len())
+ length = wchar_helper.utf8_size_as_char16(w_u._utf8)
else:
length = w_u._len()
length += 1
diff --git a/pypy/module/_cffi_backend/test/test_wchar_helper.py b/pypy/module/_cffi_backend/test/test_wchar_helper.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_cffi_backend/test/test_wchar_helper.py
@@ -0,0 +1,10 @@
+from hypothesis import given, strategies
+from pypy.module._cffi_backend.wchar_helper import utf8_size_as_char16
+
+
+
+@given(strategies.text())
+def test_utf8_size_as_char16(u):
+ assert type(u) is unicode
+ length = utf8_size_as_char16(''.join(uc.encode('utf8') for uc in u))
+ assert length == sum((1 if uc <= u'\uFFFF' else 2) for uc in u)
diff --git a/pypy/module/_cffi_backend/wchar_helper.py b/pypy/module/_cffi_backend/wchar_helper.py
--- a/pypy/module/_cffi_backend/wchar_helper.py
+++ b/pypy/module/_cffi_backend/wchar_helper.py
@@ -6,41 +6,6 @@
from rpython.rtyper.lltypesystem import lltype, rffi
from rpython.rtyper.lltypesystem.rstr import copy_unicode_to_raw
-SIZE_UNICODE = 4
-
-
-if SIZE_UNICODE == 4:
- def ordinal_to_unicode(ordinal): # 'ordinal' is a r_uint
- return unichr(intmask(ordinal))
-else:
- def ordinal_to_unicode(ordinal): # 'ordinal' is a r_uint
- if ordinal <= 0xffff:
- return unichr(intmask(ordinal))
- elif ordinal <= 0x10ffff:
- ordinal = intmask(ordinal - 0x10000)
- return (unichr(0xD800 | (ordinal >> 10)) +
- unichr(0xDC00 | (ordinal & 0x3FF)))
- else:
- raise OutOfRange(ordinal)
-
-def is_surrogate(u, index):
- return (unichr(0xD800) <= u[index + 0] <= unichr(0xDBFF) and
- unichr(0xDC00) <= u[index + 1] <= unichr(0xDFFF))
-
-def as_surrogate(u, index):
- ordinal = (ord(u[index + 0]) - 0xD800) << 10
- ordinal |= (ord(u[index + 1]) - 0xDC00)
- return r_uint(ordinal + 0x10000)
-
-def unicode_to_ordinal(u):
- if len(u) == 1:
- u = ord(u[0])
- return r_uint(u)
- elif SIZE_UNICODE == 2:
- if len(u) == 2 and is_surrogate(u, 0):
- return r_uint(as_surrogate(u, 0))
- raise ValueError
-
class OutOfRange(Exception):
ordinal = 0
@@ -49,59 +14,41 @@
ordinal = intmask(rffi.cast(rffi.INT, ordinal))
self.ordinal = ordinal
-def _unicode_from_wchar(ptr, length):
- return rffi.wcharpsize2utf8(rffi.cast(rffi.CWCHARP, ptr), length)
+def utf8_from_char32(ptr, length):
+ # 'ptr' is a pointer to 'length' 32-bit integers
+ ptr = rffi.cast(rffi.UINTP, ptr)
+ u = StringBuilder(length)
+ j = 0
+ flag = rutf8.FLAG_ASCII
+ while j < length:
+ ch = intmask(ptr[j])
+ j += 1
+ flag = rutf8.combine_flags(flag, rutf8.get_flag_from_code(ch))
+ try:
+ rutf8.unichr_as_utf8_append(u, ch, allow_surrogates=True)
+ except ValueError:
+ raise OutOfRange(ch)
+ return u.build(), length, flag
-
-if SIZE_UNICODE == 2:
- def unicode_from_char32(ptr, length):
- # 'ptr' is a pointer to 'length' 32-bit integers
- ptr = rffi.cast(rffi.UINTP, ptr)
- alloc = length
- for i in range(length):
- if rffi.cast(lltype.Unsigned, ptr[i]) > 0xFFFF:
- alloc += 1
-
- u = [u'\x00'] * alloc
- j = 0
- for i in range(length):
- ordinal = rffi.cast(lltype.Unsigned, ptr[i])
- if ordinal > 0xFFFF:
- if ordinal > 0x10FFFF:
- raise OutOfRange(ordinal)
- ordinal = intmask(ordinal - 0x10000)
- u[j] = unichr(0xD800 | (ordinal >> 10))
+def utf8_from_char16(ptr, length):
+ # 'ptr' is a pointer to 'length' 16-bit integers
+ ptr = rffi.cast(rffi.USHORTP, ptr)
+ u = StringBuilder(length)
+ j = 0
+ result_length = length
+ flag = rutf8.FLAG_ASCII
+ while j < length:
+ ch = intmask(ptr[j])
+ j += 1
+ if 0xD800 <= ch <= 0xDBFF and j < length:
+ ch2 = intmask(ptr[j])
+ if 0xDC00 <= ch2 <= 0xDFFF:
+ ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000
j += 1
- u[j] = unichr(0xDC00 | (ordinal & 0x3FF))
- j += 1
- else:
- u[j] = unichr(intmask(ordinal))
- j += 1
- assert j == len(u)
- return u''.join(u)
-
- unicode_from_char16 = _unicode_from_wchar
-
-else:
- unicode_from_char32 = _unicode_from_wchar
-
- def unicode_from_char16(ptr, length):
- # 'ptr' is a pointer to 'length' 16-bit integers
- ptr = rffi.cast(rffi.USHORTP, ptr)
- u = StringBuilder(length)
- i = 0
- j = 0
- while j < length:
- ch = intmask(ptr[j])
- j += 1
- if 0xD800 <= ch <= 0xDBFF and j < length:
- ch2 = intmask(ptr[j])
- if 0xDC00 <= ch2 <= 0xDFFF:
- ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000
- j += 1
- rutf8.unichr_as_utf8_append(u, ch)
- i += 1
- return u.build()
+ result_length -= 1
+ flag = rutf8.combine_flags(flag, rutf8.get_flag_from_code(ch))
+ rutf8.unichr_as_utf8_append(u, ch, allow_surrogates=True)
+ return u.build(), result_length, flag
@specialize.ll()
@@ -122,65 +69,44 @@
return _measure_length(rffi.cast(rffi.UINTP, ptr), maxlen)
-def unicode_size_as_char16(u, len):
- result = len
- i = 0
- while i < len(u):
- code = rutf8.codepoint_at_pos(u, i)
- if code > 0xFFFF:
- result += 1
- i = rutf8.next_codepoint_pos(u, i)
+def utf8_size_as_char16(u):
+ # Counts one per unichar in 'u', or two if they are greater than 0xffff.
+ TABLE = "\x01\x01\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x01\x01\x01\x02"
+ result = 0
+ for c in u:
+ result += ord(TABLE[ord(c) >> 4])
return result
-def _unicode_to_wchar(u, target_ptr, target_length, add_final_zero):
- # 'target_ptr' is a raw pointer to 'target_length' wchars;
- # we assume here that target_length == len(u).
- unichardata = rffi.cast(rffi.CWCHARP, target_ptr)
- copy_unicode_to_raw(llunicode(u), unichardata, 0, target_length)
+def utf8_to_char32(utf8, target_ptr, target_length, add_final_zero):
+ # 'target_ptr' is a raw pointer to 'target_length' 32-bit integers;
+ # we assume (and check) that target_length == number of unichars in utf8.
+ unichardata = rffi.cast(rffi.UINTP, target_ptr)
+ i = 0
+ for j in range(target_length):
+ code = rutf8.codepoint_at_pos(utf8, i)
+ unichardata[j] = rffi.cast(rffi.UINT, code)
+ i = rutf8.next_codepoint_pos(utf8, i)
+ assert i == len(utf8)
if add_final_zero:
- unichardata[target_length] = u'\x00'
+ unichardata[target_length] = rffi.cast(rffi.UINT, 0)
-
-if SIZE_UNICODE == 2:
- def unicode_to_char32(u, target_ptr, target_length, add_final_zero):
- # 'target_ptr' is a raw pointer to 'target_length' 32-bit integers;
- # we assume here that target_length == unicode_size_as_char32(u).
- ptr = rffi.cast(rffi.UINTP, target_ptr)
- src_index = 0
- last_surrogate_pos = len(u) - 2
- for i in range(target_length):
- if src_index <= last_surrogate_pos and is_surrogate(u, src_index):
- ordinal = as_surrogate(u, src_index)
- src_index += 2
- else:
- ordinal = r_uint(ord(u[src_index]))
- src_index += 1
- ptr[i] = rffi.cast(rffi.UINT, ordinal)
- if add_final_zero:
- ptr[target_length] = rffi.cast(rffi.UINT, 0)
-
- unicode_to_char16 = _unicode_to_wchar
-
-else:
- unicode_to_char32 = _unicode_to_wchar
-
- def unicode_to_char16(u, target_ptr, target_length, add_final_zero):
- # 'target_ptr' is a raw pointer to 'target_length' 16-bit integers;
- # we assume here that target_length == unicode_size_as_char16(u).
- ptr = rffi.cast(rffi.USHORTP, target_ptr)
- for uc in u:
- ordinal = ord(uc)
- if ordinal > 0xFFFF:
- if ordinal > 0x10FFFF:
- raise OutOfRange(ordinal)
- ordinal -= 0x10000
- ptr[0] = rffi.cast(rffi.USHORT, 0xD800 | (ordinal >> 10))
- ptr[1] = rffi.cast(rffi.USHORT, 0xDC00 | (ordinal & 0x3FF))
- ptr = rffi.ptradd(ptr, 2)
- else:
- ptr[0] = rffi.cast(rffi.USHORT, ordinal)
- ptr = rffi.ptradd(ptr, 1)
- assert ptr == (
- rffi.ptradd(rffi.cast(rffi.USHORTP, target_ptr), target_length))
- if add_final_zero:
- ptr[0] = rffi.cast(rffi.USHORT, 0)
+def utf8_to_char16(utf8, target_ptr, target_length, add_final_zero):
+ # 'target_ptr' is a raw pointer to 'target_length' 16-bit integers;
+ # we assume (and check) that target_length == utf8_size_as_char16(utf8).
+ ptr = rffi.cast(rffi.USHORTP, target_ptr)
+ i = 0
+ while i < len(utf8):
+ ordinal = rutf8.codepoint_at_pos(utf8, i)
+ if ordinal > 0xFFFF:
+ ordinal -= 0x10000
+ ptr[0] = rffi.cast(rffi.USHORT, 0xD800 | (ordinal >> 10))
+ ptr[1] = rffi.cast(rffi.USHORT, 0xDC00 | (ordinal & 0x3FF))
+ ptr = rffi.ptradd(ptr, 2)
+ else:
+ ptr[0] = rffi.cast(rffi.USHORT, ordinal)
+ ptr = rffi.ptradd(ptr, 1)
+ i = rutf8.next_codepoint_pos(utf8, i)
+ assert ptr == (
+ rffi.ptradd(rffi.cast(rffi.USHORTP, target_ptr), target_length))
+ if add_final_zero:
+ ptr[0] = rffi.cast(rffi.USHORT, 0)
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -453,6 +453,7 @@
)))))
def get_flag_from_code(oc):
+ assert isinstance(oc, int)
if oc <= 0x7F:
return FLAG_ASCII
if 0xD800 <= oc <= 0xDFFF:
More information about the pypy-commit
mailing list