[pypy-commit] pypy unicode-utf8-py3: fixes from trying pypy/module/_codecs/test
mattip
pypy.commits at gmail.com
Sun Jun 17 03:35:35 EDT 2018
Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r94773:bdc01ece02b0
Date: 2018-06-17 00:34 -0700
http://bitbucket.org/pypy/pypy/changeset/bdc01ece02b0/
Log: fixes from trying pypy/module/_codecs/test
diff --git a/pypy/interpreter/astcompiler/validate.py b/pypy/interpreter/astcompiler/validate.py
--- a/pypy/interpreter/astcompiler/validate.py
+++ b/pypy/interpreter/astcompiler/validate.py
@@ -409,7 +409,7 @@
def visit_Str(self, node):
space = self.space
w_type = space.type(node.s)
- if w_type != space.w_str:
+ if w_type != space.w_unicode:
raise oefmt(space.w_TypeError, "non-string type in Str")
def visit_Bytes(self, node):
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -883,7 +883,7 @@
u = s.decode('utf-8')
except UnicodeDecodeError:
return None
- return self.interned_strings.get(u) # may be None
+ return self.interned_strings.get(s) # may be None
@specialize.arg(1)
def descr_self_interp_w(self, RequiredClass, w_obj):
@@ -1718,10 +1718,10 @@
def utf8_0_w(self, w_obj):
"Like utf8_w, but rejects strings with NUL bytes."
from rpython.rlib import rstring
- result = w_obj.utf8_w(self).decode('utf8')
- if u'\x00' in result:
+ result = w_obj.utf8_w(self)
+ if '\x00' in result:
raise oefmt(self.w_ValueError,
- "argument must be a unicode string without NUL "
+ "argument must be a utf8 string without NUL "
"characters")
return rstring.assert_str0(result)
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -23,6 +23,17 @@
def decode_utf8(u):
return str_decode_utf8(u, "strict", True, None)
+def test_encode_utf8():
+ space = FakeSpace()
+ assert encode_utf8(space, u"abc") == "abc"
+ assert encode_utf8(space, u"\u1234") == "\xe1\x88\xb4"
+ py.test.raises(Hit, encode_utf8, space, u"\ud800")
+ py.test.raises(Hit, encode_utf8, space, u"\udc00")
+ # for the following test, go to lengths to avoid CPython's optimizer
+ # and .pyc file storage, which collapse the two surrogates into one
+ c = u"\udc00"
+ py.test.raises(Hit, encode_utf8, space, u"\ud800" + c)
+
def test_encode_utf8_allow_surrogates():
sp = FakeSpace()
assert encode_utf8(sp, u"\ud800", allow_surrogates=True) == "\xed\xa0\x80"
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -168,8 +168,9 @@
result, consumed = runicode.str_decode_utf_8(
string, len(string), "strict",
final=True, errorhandler=decode_error_handler(space),
- allow_surrogates=allow_surrogates)
- return result
+ # XXX handle surrogates
+ allow_surrogates=False)
+ return len(result)
def str_decode_ascii(s, errors, final, errorhandler):
try:
@@ -1211,8 +1212,8 @@
def str_decode_utf_32(s, errors, final=True,
errorhandler=None):
- result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final,
- s, size, errors, final, errorhandler, "native", 'utf-32-' + BYTEORDER2,
+ result, c, lgt, _ = str_decode_utf_32_helper(
+ s, errors, final, errorhandler, "native", 'utf-32-' + BYTEORDER2,
allow_surrogates=False)
return result, c, lgt
diff --git a/pypy/module/time/interp_time.py b/pypy/module/time/interp_time.py
--- a/pypy/module/time/interp_time.py
+++ b/pypy/module/time/interp_time.py
@@ -5,7 +5,7 @@
from pypy.interpreter.gateway import unwrap_spec
from pypy.interpreter.timeutils import (
SECS_TO_NS, MS_TO_NS, US_TO_NS, monotonic as _monotonic, timestamp_w)
-from pypy.interpreter.unicodehelper import decode_utf8, encode_utf8
+from pypy.interpreter.unicodehelper import str_decode_utf8
from rpython.rtyper.lltypesystem import lltype
from rpython.rlib.rarithmetic import (
intmask, r_ulonglong, r_longfloat, widen, ovfcheck, ovfcheck_float_to_int)
@@ -554,7 +554,7 @@
if HAS_TM_ZONE:
# CPython calls PyUnicode_DecodeLocale here; should we do the same?
- tm_zone = decode_utf8(space, rffi.charp2str(t.c_tm_zone),
+ tm_zone = str_decode_utf8(rffi.charp2str(t.c_tm_zone),
allow_surrogates=True)
extra = [space.newtext(tm_zone),
space.newint(rffi.getintfield(t, 'c_tm_gmtoff'))]
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -1183,54 +1183,43 @@
# we should implement the same shortcuts as we do for BytesDictStrategy
- def decodekey_str(self, key):
- return str_decode_utf8(key, "string", True, None, allow_surrogates=True)[0]
+ ## def setitem_str(self, w_dict, key, w_value):
+ ## assert key is not None
+ ## self.unerase(w_dict.dstorage)[key] = w_value
- def setitem_str(self, w_dict, key, w_value):
- assert key is not None
- self.unerase(w_dict.dstorage)[self.decodekey_str(key)] = w_value
+ ## def getitem(self, w_dict, w_key):
+ ## space = self.space
+ ## # -- This is called extremely often. Hack for performance --
+ ## if type(w_key) is space.StringObjectCls:
+ ## return self.getitem_str(w_dict, w_key.unwrap(space))
+ ## # -- End of performance hack --
+ ## return AbstractTypedStrategy.getitem(self, w_dict, w_key)
- def getitem(self, w_dict, w_key):
- space = self.space
- # -- This is called extremely often. Hack for performance --
- if type(w_key) is space.UnicodeObjectCls:
- return self.unerase(w_dict.dstorage).get(w_key.unwrap(space), None)
- # -- End of performance hack --
- return AbstractTypedStrategy.getitem(self, w_dict, w_key)
-
- def getitem_str(self, w_dict, key):
- assert key is not None
- return self.unerase(w_dict.dstorage).get(self.decodekey_str(key), None)
+ ## def getitem_str(self, w_dict, key):
+ ## assert key is not None
+ ## return self.unerase(w_dict.dstorage).get(key, None)
def listview_utf8(self, w_dict):
return self.unerase(w_dict.dstorage).keys()
- def w_keys(self, w_dict):
- return self.space.newlist_unicode(self.listview_unicode(w_dict))
+ ## def w_keys(self, w_dict):
+ ## return self.space.newlist_bytes(self.listview_bytes(w_dict))
def wrapkey(space, key):
return space.newutf8(key, len(key))
- @jit.look_inside_iff(lambda self, w_dict:
- w_dict_unrolling_heuristic(w_dict))
- def view_as_kwargs(self, w_dict):
- d = self.unerase(w_dict.dstorage)
- l = len(d)
- keys, values = [None] * l, [None] * l
- i = 0
- for key, val in d.iteritems():
- keys[i] = key.encode('utf-8')
- values[i] = val
- i += 1
- return keys, values
-
- def get_storage_fromkeys(self, keys_w, w_fill):
- """Return an initialized storage with keys and fill values"""
- storage = {}
- mark_dict_non_null(storage)
- for key in keys_w:
- storage[key] = w_fill
- return self.erase(storage)
+ ## @jit.look_inside_iff(lambda self, w_dict:
+ ## w_dict_unrolling_heuristic(w_dict))
+ ## def view_as_kwargs(self, w_dict):
+ ## d = self.unerase(w_dict.dstorage)
+ ## l = len(d)
+ ## keys, values = [None] * l, [None] * l
+ ## i = 0
+ ## for key, val in d.iteritems():
+ ## keys[i] = key
+ ## values[i] = val
+ ## i += 1
+ ## return keys, values
create_iterator_classes(UnicodeDictStrategy)
diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py
--- a/pypy/objspace/std/newformat.py
+++ b/pypy/objspace/std/newformat.py
@@ -387,7 +387,7 @@
if is_unicode:
template = unicode_template_formatter(space,
space.utf8_w(w_string))
- r = template.build(args)
+ r = template.build(args, w_kwargs)
lgt = rutf8.check_utf8(r, True)
return space.newutf8(r, lgt)
else:
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -159,13 +159,7 @@
else:
return self.newint(x)
if isinstance(x, str):
- # this hack is temporary: look at the comment in
- # test_stdstdobjspace.test_wrap_string
- try:
- unicode_x = x.decode('ascii')
- except UnicodeDecodeError:
- return self._wrap_string_old(x)
- return self.newtext(unicode_x)
+ return self.newtext(x)
if isinstance(x, unicode):
x = x.encode('utf8')
lgt = rutf8.check_utf8(x, True)
@@ -390,11 +384,17 @@
@specialize.argtype(1)
def newtext(self, s):
- if isinstance(s, str):
+ if isinstance(s, unicode):
+ s, lgt = s.encode('utf8'), len(s)
+ elif isinstance(s, str):
s, lgt, chk = str_decode_utf8(s, "string", True, None,
allow_surrogates=True)
- return W_UnicodeObject(s, lgt)
- lgt = rutf8.check_utf8(s, True)
+ elif isinstance(s, tuple):
+ # result of decode_utf8
+ s, lgt, chk = s
+ else:
+ # XXX what is s ?
+ lgt = rutf8.check_utf8(s, True)
return W_UnicodeObject(s, lgt)
def newtext_or_none(self, s):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -90,7 +90,7 @@
def text_w(self, space):
try:
identifier = jit.conditional_call_elidable(
- self._utf8, g_encode_utf8, self._value)
+ self._utf8, g_encode_utf8, self._length)
except SurrogateError as e:
raise OperationError(space.w_UnicodeEncodeError,
space.newtuple([space.newtext('utf-8'),
@@ -126,13 +126,15 @@
return True
@staticmethod
- def convert_arg_to_w_unicode(space, w_other):
+ def convert_arg_to_w_unicode(space, w_other, strict=None):
if isinstance(w_other, W_UnicodeObject):
return w_other
if space.isinstance_w(w_other, space.w_bytes):
return unicode_from_bytes(space, w_other)
- raise oefmt(space.w_TypeError,
- "Can't convert '%T' object to str implicitly", w_other)
+ if strict:
+ raise oefmt(space.w_TypeError,
+ "%s arg must be None, unicode or str", strict)
+ return unicode_from_encoded_object(space, w_other, None, "strict")
def convert_to_w_unicode(self, space):
return self
@@ -190,6 +192,8 @@
if w_object is None:
w_value = W_UnicodeObject.EMPTY
else:
+ if w_encoding and w_encoding._utf8 == 'unicode_internal':
+ import pdb;pdb.set_trace()
encoding, errors = _get_encoding_and_errors(space, w_encoding,
w_errors)
if encoding is None and errors is None:
@@ -292,7 +296,8 @@
def descr_eq(self, space, w_other):
try:
- res = self._utf8 == self.convert_arg_to_w_unicode(space, w_other)._utf8
+ res = self._utf8 == self.convert_arg_to_w_unicode(space, w_other,
+ strict='__eq__')._utf8
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -301,7 +306,8 @@
def descr_ne(self, space, w_other):
try:
- res = self._utf8 != self.convert_arg_to_w_unicode(space, w_other)._utf8
+ res = self._utf8 != self.convert_arg_to_w_unicode(space, w_other,
+ strict='__neq__')._utf8
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -1056,7 +1062,7 @@
def _strip(self, space, w_chars, left, right, name='strip'):
"internal function called by str_xstrip methods"
value = self._utf8
- chars = self.convert_arg_to_w_unicode(space, w_chars, strict=name)._utf8
+ chars = self.convert_arg_to_w_unicode(space, w_chars)._utf8
lpos = 0
rpos = len(value)
More information about the pypy-commit mailing list