[pypy-commit] pypy unicode-utf8: whack at cffi
fijal
pypy.commits at gmail.com
Tue Nov 21 15:09:40 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r93119:c6537b6d453f
Date: 2017-11-21 21:09 +0100
http://bitbucket.org/pypy/pypy/changeset/c6537b6d453f/
Log: whack at cffi
diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -7,3 +7,5 @@
* better flag handling in split/splitlines maybe?
* encode_error_handler has XXX
* remove assertions from W_UnicodeObject.__init__ if all the builders pass
+* decide what to do with error handlers that go backwards; there were tests
+ in test_codecs that checked for that
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1773,6 +1773,13 @@
"characters")
return rstring.assert_str0(result)
+ def convert_arg_to_w_unicode(self, w_obj, strict=None):
+ # XXX why convert_to_w_unicode does something slightly different?
+ from pypy.objspace.std.unicodeobject import W_UnicodeObject
+ assert not hasattr(self, 'is_fake_objspace')
+ return W_UnicodeObject.convert_arg_to_w_unicode(self, w_obj, strict)
+
+
def realutf8_w(self, w_obj):
# Like utf8_w(), but only works if w_obj is really of type
# 'unicode'. On Python 3 this is the same as utf8_w().
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -35,9 +35,7 @@
return raise_unicode_exception_encode
def convert_arg_to_w_unicode(space, w_arg, strict=None):
- from pypy.objspace.std.unicodeobject import W_UnicodeObject
- assert not hasattr(space, 'is_fake_objspace')
- return W_UnicodeObject.convert_arg_to_w_unicode(space, w_arg, strict)
+ # Thin wrapper kept for existing callers; the conversion itself is done
+ # by the object space.  'strict' must be forwarded: dropping it would
+ # silently give every caller the strict=None behaviour.
+ return space.convert_arg_to_w_unicode(w_arg, strict)
# ____________________________________________________________
diff --git a/pypy/module/_cffi_backend/ctypearray.py b/pypy/module/_cffi_backend/ctypearray.py
--- a/pypy/module/_cffi_backend/ctypearray.py
+++ b/pypy/module/_cffi_backend/ctypearray.py
@@ -63,11 +63,14 @@
return (w_value, len(s) + 1)
elif space.isinstance_w(w_value, space.w_unicode):
from pypy.module._cffi_backend import wchar_helper
- u = space.unicode_w(w_value)
- if self.ctitem.size == 2:
- length = wchar_helper.unicode_size_as_char16(u)
+ w_u = space.convert_arg_to_w_unicode(w_value)
+ # the attribute is 'ctitem' (as in the removed code above), not 'citem'
+ if self.ctitem.size == 4:
+ length = w_u._len()
else:
- length = wchar_helper.unicode_size_as_char32(u)
+ if not w_u._has_surrogates():
+ length = w_u._len()
+ else:
+ length = wchar_helper.unicode_size_as_char16(w_u._utf8, w_u._len())
return (w_value, length + 1)
else:
explicitlength = space.getindex_w(w_value, space.w_OverflowError)
diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py
--- a/pypy/module/_cffi_backend/ctypeprim.py
+++ b/pypy/module/_cffi_backend/ctypeprim.py
@@ -5,7 +5,7 @@
import sys
from rpython.rlib.rarithmetic import r_uint, r_ulonglong, intmask
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
from rpython.rtyper.lltypesystem import lltype, rffi
from rpython.rtyper.tool import rfficache
@@ -40,14 +40,15 @@
return ord(s[0])
def cast_unicode(self, w_ob):
+ # Cast a unicode object of length 1 to the integer value of its single
+ # character; any other length raises TypeError.
space = self.space
- s = space.unicode_w(w_ob)
- try:
- ordinal = wchar_helper.unicode_to_ordinal(s)
- except ValueError:
+ w_u = space.convert_arg_to_w_unicode(w_ob)
+ if w_u._len() != 1:
raise oefmt(space.w_TypeError,
"cannot cast unicode string of length %d to ctype '%s'",
- len(s), self.name)
+ w_u._len(), self.name)
+ # length is 1 here, so the only character starts at offset 0 of the utf-8 data
+ ordinal = rutf8.codepoint_at_pos(w_u._utf8, 0)
return intmask(ordinal)
def cast(self, w_ob):
@@ -175,8 +176,10 @@
def convert_to_object(self, cdata):
if self.is_signed_wchar:
- unichardata = rffi.cast(rffi.CWCHARP, cdata)
- return self.space.newunicode(unichardata[0])
+ code = ord(rffi.cast(rffi.CWCHARP, cdata)[0])
+ return self.space.newutf8(
+ rutf8.unichr_as_utf8(code), 1,
+ rutf8.get_flag_from_code(code))
else:
value = misc.read_raw_ulong_data(cdata, self.size) # r_uint
try:
@@ -185,7 +188,8 @@
raise oefmt(self.space.w_ValueError,
"char32_t out of range for "
"conversion to unicode: %s", hex(e.ordinal))
- return self.space.newunicode(u)
+ return self.space.newutf8(rutf8.unichr_as_utf8(ord(u)), 1,
+ rutf8.get_flag_from_code(ord(u)))
def string(self, cdataobj, maxlen):
with cdataobj as ptr:
@@ -196,16 +200,7 @@
# returns a r_uint. If self.size == 2, it is smaller than 0x10000
space = self.space
if space.isinstance_w(w_ob, space.w_unicode):
- u = space.unicode_w(w_ob)
- try:
- ordinal = wchar_helper.unicode_to_ordinal(u)
- except ValueError:
- pass
- else:
- if self.size == 2 and ordinal > 0xffff:
- raise self._convert_error("single character <= 0xFFFF",
- w_ob)
- return ordinal
+ return rutf8.codepoint_at_pos(space.utf8_w(w_ob), 0)
elif (isinstance(w_ob, cdataobj.W_CData) and
isinstance(w_ob.ctype, W_CTypePrimitiveUniChar) and
w_ob.ctype.size == self.size):
diff --git a/pypy/module/_cffi_backend/ctypeptr.py b/pypy/module/_cffi_backend/ctypeptr.py
--- a/pypy/module/_cffi_backend/ctypeptr.py
+++ b/pypy/module/_cffi_backend/ctypeptr.py
@@ -91,11 +91,15 @@
from pypy.module._cffi_backend import wchar_helper
if not space.isinstance_w(w_ob, space.w_unicode):
raise self._convert_error("unicode or list or tuple", w_ob)
- s = space.unicode_w(w_ob)
- if self.ctitem.size == 2:
- n = wchar_helper.unicode_size_as_char16(s)
+ w_u = space.convert_arg_to_w_unicode(w_ob)
+ # compare the width of the item type (ctitem), as the removed code did,
+ # not the size of this ctype itself
+ if self.ctitem.size == 4:
+ n = w_u._len()
else:
- n = wchar_helper.unicode_size_as_char32(s)
+ if not w_u._has_surrogates():
+ n = w_u._len()
+ else:
+ n = wchar_helper.unicode_size_as_char16(w_u._utf8,
+ w_u._len())
if self.length >= 0 and n > self.length:
raise oefmt(space.w_IndexError,
"initializer unicode string is too long for '%s' "
@@ -328,11 +332,12 @@
length = len(s) + 1
elif space.isinstance_w(w_init, space.w_unicode):
from pypy.module._cffi_backend import wchar_helper
- u = space.unicode_w(w_init)
+ w_u = space.convert_arg_to_w_unicode(w_init)
if self.ctitem.size == 2:
- length = wchar_helper.unicode_size_as_char16(u)
+ length = wchar_helper.unicode_size_as_char16(w_u._utf8,
+ w_u._len())
else:
- length = wchar_helper.unicode_size_as_char32(u)
+ length = w_u._len()
length += 1
elif self.is_file:
result = self.prepare_file(w_init)
diff --git a/pypy/module/_cffi_backend/wchar_helper.py b/pypy/module/_cffi_backend/wchar_helper.py
--- a/pypy/module/_cffi_backend/wchar_helper.py
+++ b/pypy/module/_cffi_backend/wchar_helper.py
@@ -1,10 +1,12 @@
+from rpython.rlib import rutf8
from rpython.rlib.objectmodel import specialize
+from rpython.rlib.rstring import StringBuilder
from rpython.rlib.rarithmetic import r_uint, r_ulonglong, intmask
from rpython.rtyper.annlowlevel import llunicode
from rpython.rtyper.lltypesystem import lltype, rffi
from rpython.rtyper.lltypesystem.rstr import copy_unicode_to_raw
-SIZE_UNICODE = rffi.sizeof(lltype.UniChar)
+SIZE_UNICODE = 4
if SIZE_UNICODE == 4:
@@ -48,7 +50,7 @@
self.ordinal = ordinal
def _unicode_from_wchar(ptr, length):
- return rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length)
+ return rffi.wcharpsize2utf8(rffi.cast(rffi.CWCHARP, ptr), length)
if SIZE_UNICODE == 2:
@@ -86,7 +88,7 @@
def unicode_from_char16(ptr, length):
# 'ptr' is a pointer to 'length' 16-bit integers
ptr = rffi.cast(rffi.USHORTP, ptr)
- u = [u'\x00'] * length
+ u = StringBuilder(length)
i = 0
j = 0
while j < length:
@@ -97,10 +99,9 @@
if 0xDC00 <= ch2 <= 0xDFFF:
ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000
j += 1
- u[i] = unichr(ch)
+ rutf8.unichr_as_utf8_append(u, ch)
i += 1
- del u[i:]
- return u''.join(u)
+ return u.build()
@specialize.ll()
@@ -121,23 +122,16 @@
return _measure_length(rffi.cast(rffi.UINTP, ptr), maxlen)
-def unicode_size_as_char16(u):
- result = len(u)
- if SIZE_UNICODE == 4:
- for i in range(result):
- if ord(u[i]) > 0xFFFF:
- result += 1
+def unicode_size_as_char16(u, length):
+ # Return the number of 16-bit units needed for the utf-8 string 'u':
+ # start from 'length' (the caller-supplied character count) and add one
+ # for every code point above 0xFFFF.
+ # The second parameter must not be named 'len': that shadows the builtin
+ # and makes the len(u) call below try to call an integer.
+ result = length
+ i = 0
+ # byte length of the utf-8 data, computed once outside the loop
+ size = len(u)
+ while i < size:
+ code = rutf8.codepoint_at_pos(u, i)
+ if code > 0xFFFF:
+ result += 1
+ # step to the byte offset of the next code point
+ i = rutf8.next_codepoint_pos(u, i)
return result
-def unicode_size_as_char32(u):
- result = len(u)
- if SIZE_UNICODE == 2 and result > 1:
- for i in range(result - 1):
- if is_surrogate(u, i):
- result -= 1
- return result
-
-
def _unicode_to_wchar(u, target_ptr, target_length, add_final_zero):
# 'target_ptr' is a raw pointer to 'target_length' wchars;
# we assume here that target_length == len(u).
More information about the pypy-commit
mailing list