[pypy-commit] pypy unicode-utf8: general progress
fijal
pypy.commits at gmail.com
Tue Nov 21 11:19:51 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r93114:cefc9ed0b4c5
Date: 2017-11-21 17:19 +0100
http://bitbucket.org/pypy/pypy/changeset/cefc9ed0b4c5/
Log: general progress
diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -74,8 +74,8 @@
substr = s[ps : q]
if rawmode or '\\' not in s[ps:]:
if need_encoding:
- utf, (lgt, flag) = unicodehelper.decode_utf8(space, substr)
- w_u = space.newutf8(utf, lgt, flag)
+ lgt, flag = unicodehelper.check_utf8_or_raise(space, substr)
+ w_u = space.newutf8(substr, lgt, flag)
w_v = unicodehelper.encode(space, w_u, encoding)
return w_v
else:
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1094,9 +1094,9 @@
byteorder = BYTEORDER
pos = 0
+ index = 0
while pos < size:
ch = rutf8.codepoint_at_pos(s, pos)
- pos = rutf8.next_codepoint_pos(s, pos)
if ch < 0xD800:
_STORECHAR(result, ch, byteorder)
@@ -1106,27 +1106,27 @@
elif ch >= 0xE000 or allow_surrogates:
_STORECHAR(result, ch, byteorder)
else:
- ru, pos = errorhandler(errors, public_encoding_name,
+ ru, newindex = errorhandler(errors, public_encoding_name,
'surrogates not allowed',
s, pos-1, pos)
- xxx
- #if rs is not None:
- # # py3k only
- # if len(rs) % 2 != 0:
- # errorhandler('strict', public_encoding_name,
- # 'surrogates not allowed',
- # s, pos-1, pos)
- # result.append(rs)
- # continue
- for ch in ru:
+ for j in range(newindex - index):
+ pos = rutf8.next_codepoint_pos(s, pos)
+ j = 0
+ while j < len(ru):
+ ch = rutf8.codepoint_at_pos(ru, j)
if ord(ch) < 0xD800:
_STORECHAR(result, ord(ch), byteorder)
else:
errorhandler('strict', public_encoding_name,
'surrogates not allowed',
s, pos-1, pos)
+ j = rutf8.next_codepoint_pos(ru, j)
+ index = newindex
continue
+ pos = rutf8.next_codepoint_pos(s, pos)
+ index += 1
+
return result.build()
def utf8_encode_utf_16(s, errors,
@@ -1285,32 +1285,30 @@
byteorder = BYTEORDER
pos = 0
+ index = 0
while pos < size:
ch = rutf8.codepoint_at_pos(s, pos)
pos = rutf8.next_codepoint_pos(s, pos)
- ch2 = 0
if not allow_surrogates and 0xD800 <= ch < 0xE000:
- ru, pos = errorhandler(errors, public_encoding_name,
+ ru, newindex = errorhandler(errors, public_encoding_name,
'surrogates not allowed',
s, pos-1, pos)
- XXX
- if rs is not None:
- # py3k only
- if len(rs) % 4 != 0:
- errorhandler('strict', public_encoding_name,
- 'surrogates not allowed',
- s, pos-1, pos)
- result.append(rs)
- continue
- for ch in ru:
+ for j in range(newindex - index):
+ pos = rutf8.next_codepoint_pos(s, pos)
+ j = 0
+ while j < len(ru):
+ ch = rutf8.codepoint_at_pos(ru, j)
if ord(ch) < 0xD800:
_STORECHAR32(result, ord(ch), byteorder)
else:
errorhandler('strict', public_encoding_name,
- 'surrogates not allowed',
- s, pos-1, pos)
+ 'surrogates not allowed',
+ s, pos-1, pos)
+ j = rutf8.next_codepoint_pos(ru, j)
+ index = newindex
continue
_STORECHAR32(result, ch, byteorder)
+ index += 1
return result.build()
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -2,8 +2,9 @@
from rpython.rtyper.lltypesystem import lltype, rffi
from rpython.translator.tool.cbuild import ExternalCompilationInfo
from rpython.translator import cdir
+from rpython.rlib import rutf8
-UNICODE_REPLACEMENT_CHARACTER = u'\uFFFD'
+UNICODE_REPLACEMENT_CHARACTER = u'\uFFFD'.encode("utf8")
class EncodeDecodeError(Exception):
@@ -126,7 +127,7 @@
errorcb, namecb, stringdata)
src = pypy_cjk_dec_outbuf(decodebuf)
length = pypy_cjk_dec_outlen(decodebuf)
- return rffi.wcharpsize2unicode(src, length)
+ return rffi.wcharpsize2utf8(src, length)
def multibytecodec_decerror(decodebuf, e, errors,
errorcb, namecb, stringdata):
@@ -148,7 +149,7 @@
if errors == "strict":
raise EncodeDecodeError(start, end, reason)
elif errors == "ignore":
- replace = u""
+ replace = ""
elif errors == "replace":
replace = UNICODE_REPLACEMENT_CHARACTER
else:
@@ -156,8 +157,12 @@
replace, end = errorcb(errors, namecb, reason,
stringdata, start, end)
# 'replace' is RPython unicode here
- with rffi.scoped_nonmoving_unicodebuffer(replace) as inbuf:
- r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, len(replace), end)
+ lgt, _ = rutf8.check_utf8(replace, True)
+ inbuf = rffi.utf82wcharp(replace, lgt)
+ try:
+ r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, lgt, end)
+ finally:
+ lltype.free(inbuf, flavor='raw')
if r == MBERR_NOMEMORY:
raise MemoryError
@@ -256,6 +261,7 @@
replace = "?"
else:
assert errorcb
+ XXX
retu, rets, end = errorcb(errors, namecb, reason,
unicodedata.encode("utf8"), start, end)
if rets is not None:
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -1,3 +1,6 @@
+
+from rpython.rlib import rutf8
+
from pypy.interpreter.baseobjspace import W_Root
from pypy.interpreter.gateway import interp2app, unwrap_spec
from pypy.interpreter.typedef import TypeDef
@@ -18,13 +21,14 @@
state = space.fromcache(CodecState)
#
try:
- u_output = c_codecs.decode(self.codec, input, errors,
+ utf8_output = c_codecs.decode(self.codec, input, errors,
state.decode_error_handler, self.name)
except c_codecs.EncodeDecodeError as e:
raise wrap_unicodedecodeerror(space, e, input, self.name)
except RuntimeError:
raise wrap_runtimeerror(space)
- return space.newtuple([space.newunicode(u_output),
+ lgt, flag = rutf8.check_utf8(utf8_output, True)
+ return space.newtuple([space.newutf8(utf8_output, lgt, flag),
space.newint(len(input))])
@unwrap_spec(input='utf8', errors="text_or_none")
@@ -74,7 +78,7 @@
space.newtext(e.reason)]))
def wrap_unicodeencodeerror(space, e, input, inputlen, name):
- flag = 13
+ _, flag = rutf8.check_utf8(input, True)
raise OperationError(
space.w_UnicodeEncodeError,
space.newtuple([
diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py
--- a/pypy/objspace/std/newformat.py
+++ b/pypy/objspace/std/newformat.py
@@ -841,8 +841,7 @@
prefix = "0x"
as_str = value.format(LONG_DIGITS[:base], prefix)
if self.is_unicode:
- XXX
- return as_str.decode("latin-1")
+ return rutf8.decode_latin_1(as_str)
return as_str
def _int_to_base(self, base, value):
diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -1009,6 +1009,29 @@
wcharp2unicoden, wcharpsize2unicode, unicode2wchararray, unicode2rawmem,
) = make_string_mappings(unicode)
+def wcharpsize2utf8(w, size):
+ """ Helper to convert WCHARP pointer to utf8 in one go.
+ Equivalent to wcharpsize2unicode().encode("utf8")
+ """
+ from rpython.rlib import rutf8
+
+ s = StringBuilder(size)
+ for i in range(size):
+ rutf8.unichr_as_utf8_append(s, ord(w[i]))
+ return s.build()
+
+def utf82wcharp(utf8, utf8len):
+ from rpython.rlib import rutf8
+
+ w = lltype.malloc(CWCHARP.TO, utf8len, flavor='raw')
+ i = 0
+ index = 0
+ while i < len(utf8):
+ w[index] = unichr(rutf8.codepoint_at_pos(utf8, i))
+ i = rutf8.next_codepoint_pos(utf8, i)
+ index += 1
+ return w
+
# char**
CCHARPP = lltype.Ptr(lltype.Array(CCHARP, hints={'nolength': True}))
diff --git a/rpython/rtyper/lltypesystem/test/test_rffi.py b/rpython/rtyper/lltypesystem/test/test_rffi.py
--- a/rpython/rtyper/lltypesystem/test/test_rffi.py
+++ b/rpython/rtyper/lltypesystem/test/test_rffi.py
@@ -590,6 +590,14 @@
res = fn(expected_extra_mallocs=range(30))
assert res == 32 * len(d)
+ def test_wcharp_to_utf8(self):
+ wchar = lltype.malloc(CWCHARP.TO, 3, flavor='raw')
+ wchar[0] = u'\u1234'
+ wchar[1] = u'\x80'
+ wchar[2] = u'a'
+ assert wcharpsize2utf8(wchar, 3).decode("utf8") == u'\u1234\x80a'
+ lltype.free(wchar, flavor='raw')
+
class TestRffiInternals:
def test_struct_create(self):
X = CStruct('xx', ('one', INT))
More information about the pypy-commit mailing list