[pypy-commit] pypy unicode-utf8: interpreter fixes
fijal
pypy.commits at gmail.com
Tue Nov 21 04:30:34 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r93109:86548802b11b
Date: 2017-11-21 10:29 +0100
http://bitbucket.org/pypy/pypy/changeset/86548802b11b/
Log: interpreter fixes
diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -231,11 +231,14 @@
return s[pt:ps]
def decode_utf8_recode(space, s, ps, end, recode_encoding):
- lgt, flag = unicodehelper.check_utf8_or_raise(space, s, ps, end)
- w_v = unicodehelper.encode(space, space.newutf8(s[ps:end], lgt, flag),
+ p = ps
+ while p < end and ord(s[p]) & 0x80:
+ p += 1
+ lgt, flag = unicodehelper.check_utf8_or_raise(space, s, ps, p)
+ w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt, flag),
recode_encoding)
v = space.bytes_w(w_v)
- return v, ps
+ return v, p
def raise_app_valueerror(space, msg):
raise OperationError(space.w_ValueError, space.newtext(msg))
diff --git a/pypy/interpreter/pyparser/test/test_parsestring.py b/pypy/interpreter/pyparser/test/test_parsestring.py
--- a/pypy/interpreter/pyparser/test/test_parsestring.py
+++ b/pypy/interpreter/pyparser/test/test_parsestring.py
@@ -10,7 +10,7 @@
assert space.str_w(w_ret) == value
elif isinstance(value, unicode):
assert space.type(w_ret) == space.w_unicode
- assert space.unicode_w(w_ret) == value
+ assert space.utf8_w(w_ret).decode('utf8') == value
else:
assert False
@@ -102,7 +102,4 @@
def test_decode_unicode_utf8(self):
buf = parsestring.decode_unicode_utf8(self.space,
'u"\xf0\x9f\x92\x8b"', 2, 6)
- if sys.maxunicode == 65535:
- assert buf == r"\U0000d83d\U0000dc8b"
- else:
- assert buf == r"\U0001f48b"
+ assert buf == r"\U0001f48b"
diff --git a/pypy/interpreter/test/test_objspace.py b/pypy/interpreter/test/test_objspace.py
--- a/pypy/interpreter/test/test_objspace.py
+++ b/pypy/interpreter/test/test_objspace.py
@@ -216,9 +216,7 @@
space = self.space
w = space.wrap
assert space.text0_w(w("123")) == "123"
- exc = space.raises_w(space.w_TypeError, space.text0_w, w("123\x004"))
- assert space.unicode0_w(w(u"123")) == u"123"
- exc = space.raises_w(space.w_TypeError, space.unicode0_w, w(u"123\x004"))
+ space.raises_w(space.w_TypeError, space.text0_w, w("123\x004"))
def test_getindex_w(self):
w_instance1 = self.space.appexec([], """():
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -351,12 +351,12 @@
try:
chr = r_uint(int(s[pos:pos+digits], 16))
except ValueError:
- aaaa
endinpos = pos
while s[endinpos] in hexdigits:
endinpos += 1
res, pos = errorhandler(errors, encoding,
message, s, pos-2, endinpos)
+ size, flag = rutf8.check_utf8(res, True)
builder.append(res)
else:
# when we get here, chr is a 32-bit unicode character
@@ -1392,7 +1392,7 @@
while pos < size:
ch = s[pos]
- c = mapping.get(ch, ERROR_CHAR)
+ c = mapping.get(ord(ch), ERROR_CHAR)
if c == ERROR_CHAR:
r, pos = errorhandler(errors, "charmap",
"character maps to <undefined>",
@@ -1407,20 +1407,17 @@
def utf8_encode_charmap(s, errors, errorhandler=None,
mapping=None):
- YYY
+ size = len(s)
if mapping is None:
- return unicode_encode_latin_1(s, size, errors,
- errorhandler=errorhandler)
-
- if errorhandler is None:
- errorhandler = default_unicode_error_encode
+ return utf8_encode_latin_1(s, size, errors,
+ errorhandler=errorhandler)
if size == 0:
return ''
result = StringBuilder(size)
pos = 0
while pos < size:
- ch = s[pos]
+ ch = rutf8.codepoint_at_pos(s, pos)
c = mapping.get(ch, '')
if len(c) == 0:
@@ -1428,9 +1425,10 @@
collend = pos + 1
while collend < size and mapping.get(s[collend], '') == '':
collend += 1
- ru, rs, pos = errorhandler(errors, "charmap",
- "character maps to <undefined>",
- s, pos, collend)
+ rs, pos = errorhandler(errors, "charmap",
+ "character maps to <undefined>",
+ s, pos, collend)
+ XXXX
if rs is not None:
# py3k only
result.append(rs)
@@ -1445,6 +1443,6 @@
result.append(c2)
continue
result.append(c)
- pos += 1
+ pos = rutf8.next_codepoint_pos(s, pos)
return result.build()
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -551,10 +551,10 @@
# get the character from the mapping
if self.mapping_w is not None:
- w_ch = self.mapping_w[ord(ch)]
+ w_ch = self.mapping_w[ch]
else:
try:
- w_ch = space.getitem(self.w_mapping, space.newint(ord(ch)))
+ w_ch = space.getitem(self.w_mapping, space.newint(ch))
except OperationError as e:
if not e.match(space, space.w_LookupError):
raise
@@ -587,7 +587,7 @@
# get the character from the mapping
try:
- w_ch = space.getitem(self.w_mapping, space.newint(ord(ch)))
+ w_ch = space.getitem(self.w_mapping, space.newint(ch))
except OperationError as e:
if not e.match(space, space.w_LookupError):
raise
@@ -633,8 +633,8 @@
return space.newtuple([space.newutf8(result, lgt, flag),
space.newint(consumed)])
-@unwrap_spec(utf8='utf8', errors='text_or_none')
-def charmap_encode(space, utf8, errors="strict", w_mapping=None):
+@unwrap_spec(errors='text_or_none')
+def charmap_encode(space, w_unicode, errors="strict", w_mapping=None):
from pypy.interpreter import unicodehelper
if errors is None:
@@ -645,9 +645,10 @@
mapping = Charmap_Encode(space, w_mapping)
state = space.fromcache(CodecState)
- result = unicodehelper.unicode_encode_charmap(
- utf8, errors, state.encode_error_handler, mapping)
- return space.newtuple([space.newbytes(result), space.newint(len(uni))])
+ w_uni = unicodehelper.convert_arg_to_w_unicode(space, w_unicode)
+ result = unicodehelper.utf8_encode_charmap(
+ space.utf8_w(w_uni), errors, state.encode_error_handler, mapping)
+ return space.newtuple([space.newbytes(result), space.newint(w_uni._len())])
@unwrap_spec(chars='utf8')
More information about the pypy-commit
mailing list