[pypy-commit] pypy unicode-utf8-py3: merge unicode-utf8 into branch (way too painful)
mattip
pypy.commits at gmail.com
Wed Jan 2 09:09:48 EST 2019
Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95568:43c80d4b68c9
Date: 2019-01-02 13:49 +0200
http://bitbucket.org/pypy/pypy/changeset/43c80d4b68c9/
Log: merge unicode-utf8 into branch (way too painful)
diff --git a/LICENSE b/LICENSE
--- a/LICENSE
+++ b/LICENSE
@@ -30,7 +30,7 @@
DEALINGS IN THE SOFTWARE.
-PyPy Copyright holders 2003-2018
+PyPy Copyright holders 2003-2019
--------------------------------
Except when otherwise stated (look for LICENSE files or information at
diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -4,6 +4,7 @@
* improve performance of splitlines
* fix _pypyjson to not use a wrapped dict when decoding an object
* make sure we review all the places that call ord(unichr) to check for ValueErrors
+* Find a more elegant way to define MAXUNICODE in rpython/rlib/runicode.py
* rewrite unicodeobject.unicode_to_decimal_w to only use utf8 encoded bytes
* revisit why runicode import str_decode_utf_8_impl needed instead of runicode import str_decode_utf_8
* revisit all places where we do utf8.decode('utf-8'), they should work directly with utf8
diff --git a/extra_tests/test_cPickle.py b/extra_tests/test_cPickle.py
new file mode 100644
--- /dev/null
+++ b/extra_tests/test_cPickle.py
@@ -0,0 +1,34 @@
+import pytest
+import cPickle
+
+def test_stack_underflow():
+ with pytest.raises(cPickle.UnpicklingError):
+ cPickle.loads("a string")
+
+def test_bad_key():
+ with pytest.raises(cPickle.UnpicklingError) as excinfo:
+ cPickle.loads("v")
+ assert str(excinfo.value) == "invalid load key, 'v'."
+
+def test_find_global():
+ import time, cStringIO
+ entry = time.strptime('Fri Mar 27 22:20:42 2017')
+ f = cStringIO.StringIO()
+ cPickle.Pickler(f).dump(entry)
+
+ f = cStringIO.StringIO(f.getvalue())
+ e = cPickle.Unpickler(f).load()
+ assert e == entry
+
+ f = cStringIO.StringIO(f.getvalue())
+ up = cPickle.Unpickler(f)
+ up.find_global = None
+ with pytest.raises(cPickle.UnpicklingError) as e:
+ up.load()
+ assert str(e.value) == "Global and instance pickles are not supported."
+
+ f = cStringIO.StringIO(f.getvalue())
+ up = cPickle.Unpickler(f)
+ up.find_global = lambda module, name: lambda a, b: (name, a, b)
+ e = up.load()
+ assert e == ('struct_time', (2017, 3, 27, 22, 20, 42, 4, 86, -1), {})
diff --git a/extra_tests/test_cStringIO.py b/extra_tests/test_cStringIO.py
new file mode 100644
--- /dev/null
+++ b/extra_tests/test_cStringIO.py
@@ -0,0 +1,23 @@
+"""
+Tests for the PyPy cStringIO implementation.
+"""
+from cStringIO import StringIO
+
+data = b"some bytes"
+
+def test_reset():
+ """
+ Test that the reset method of cStringIO objects sets the position
+ marker to the beginning of the stream.
+ """
+ stream = StringIO()
+ stream.write(data)
+ assert stream.read() == ''
+ stream.reset()
+ assert stream.read() == data
+
+ stream = StringIO(data)
+ assert stream.read() == data
+ assert stream.read() == ''
+ stream.reset()
+ assert stream.read() == data
diff --git a/extra_tests/test_string.py b/extra_tests/test_string.py
new file mode 100644
--- /dev/null
+++ b/extra_tests/test_string.py
@@ -0,0 +1,46 @@
+
+"""
+Test module for functions in string.py
+"""
+import pytest
+
+def test_maketrans():
+ import string
+ assert string.maketrans('', '') == (
+ '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12'
+ '\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0'
+ '123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstu'
+ 'vwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d'
+ '\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e'
+ '\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf'
+ '\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0'
+ '\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1'
+ '\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2'
+ '\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3'
+ '\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff')
+ assert string.maketrans('a', 'b') == (
+ '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12'
+ '\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0'
+ '123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`bbcdefghijklmnopqrstu'
+ 'vwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d'
+ '\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e'
+ '\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf'
+ '\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0'
+ '\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1'
+ '\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2'
+ '\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3'
+ '\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff')
+ assert string.maketrans('ab', 'cd') == (
+ '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12'
+ '\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0'
+ '123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`cdcdefghijklmnopqrstu'
+ 'vwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d'
+ '\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e'
+ '\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf'
+ '\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0'
+ '\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1'
+ '\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2'
+ '\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3'
+ '\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff')
+ with pytest.raises(ValueError):
+ string.maketrans('aa', '')
diff --git a/pypy/doc/conf.py b/pypy/doc/conf.py
--- a/pypy/doc/conf.py
+++ b/pypy/doc/conf.py
@@ -59,7 +59,7 @@
# General information about the project.
project = u'PyPy'
-copyright = u'2018, The PyPy Project'
+copyright = u'2019, The PyPy Project'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -74,4 +74,8 @@
Make it possible to manually manage the GC by using a combination of
gc.disable() and gc.collect_step(). Make sure to write a proper release
announcement in which we explain that existing programs could leak memory if
-they run for too much time between a gc.disable()/gc.enable()
\ No newline at end of file
+they run for too much time between a gc.disable()/gc.enable()
+
+.. branch: unicode-utf8
+
+Use utf8 internally to represent unicode
diff --git a/pypy/interpreter/argument.py b/pypy/interpreter/argument.py
--- a/pypy/interpreter/argument.py
+++ b/pypy/interpreter/argument.py
@@ -583,31 +583,33 @@
if num_remainingkwds == 1:
for i in range(len(keywords)):
if i not in kwds_mapping:
- name = keywords[i]
- if name is None:
- # We'll assume it's unicode. Encode it.
- # Careful, I *think* it should not be possible to
- # get an IndexError here but you never know.
- try:
- if keyword_names_w is None:
- raise IndexError
- # note: negative-based indexing from the end
- w_name = keyword_names_w[i - len(keywords)]
- except IndexError:
+ name = '?'
+ # We'll assume it's unicode. Encode it.
+ # Careful, I *think* it should not be possible to
+ # get an IndexError here but you never know.
+ try:
+ if keyword_names_w is None:
+ raise IndexError
+ # note: negative-based indexing from the end
+ w_name = keyword_names_w[i - len(keywords)]
+ except IndexError:
+ if keywords is None:
name = '?'
else:
- name = space.text_w(w_name)
+ name = keywords[i]
+ else:
+ w_enc = space.newtext(space.sys.defaultencoding)
+ w_err = space.newtext("replace")
+ w_name = space.call_method(w_name, "encode", w_enc,
+ w_err)
+ name = space.text_w(w_name)
break
self.kwd_name = name
def getmsg(self):
if self.num_kwds == 1:
- if isinstance(self.kwd_name, unicode):
- uname = unicode_encode_utf_8(self.kwd_name, len(self.kwd_name),
- 'strict', allow_surrogates=False)
- else:
- uname = self.kwd_name
- msg = "got an unexpected keyword argument '%s'" % uname
+ msg = "got an unexpected keyword argument '%s'" % (
+ self.kwd_name)
else:
msg = "got %d unexpected keyword arguments" % (
self.num_kwds)
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1605,6 +1605,8 @@
else:
assert False
+ if self.isinstance_w(w_obj, self.w_unicode):
+ return w_obj.charbuf_w(self)
def text_or_none_w(self, w_obj):
return None if self.is_none(w_obj) else self.text_w(w_obj)
diff --git a/pypy/interpreter/test/test_argument.py b/pypy/interpreter/test/test_argument.py
--- a/pypy/interpreter/test/test_argument.py
+++ b/pypy/interpreter/test/test_argument.py
@@ -55,6 +55,9 @@
pass
class DummySpace(object):
+ class sys:
+ defaultencoding = 'utf-8'
+
def newtuple(self, items):
return tuple(items)
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,6 +1,10 @@
import py
import pytest
-from hypothesis import given, strategies
+try:
+ from hypothesis import given, strategies
+ HAS_HYPOTHESIS = True
+except ImportError:
+ HAS_HYPOTHESIS = False
import struct
import sys
from pypy.interpreter.unicodehelper import (
@@ -130,13 +134,6 @@
with pytest.raises(UnicodeDecodeError):
str_decode_utf_32_be(b"\x00\x00\xdc\x80", 4, None)
-
-@given(strategies.text())
-def test_utf8_encode_ascii_2(u):
- def eh(errors, encoding, reason, p, start, end):
- return "?" * (end - start), end, 'b'
- assert utf8_encode_ascii(u.encode("utf8"), "replace", eh) == u.encode("ascii", "replace")
-
def test_str_decode_ascii():
assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3)
def eh(errors, encoding, reason, p, start, end):
@@ -156,16 +153,6 @@
("??", "ascii", input, 5, 6),
("??", "ascii", input, 6, 7)]
-@given(strategies.text())
-def test_unicode_raw_escape(u):
- r = uh.utf8_encode_raw_unicode_escape(u.encode("utf8"), 'strict', None)
- assert r == u.encode("raw-unicode-escape")
-
-@given(strategies.text())
-def test_unicode_escape(u):
- r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None)
- assert r == u.encode("unicode-escape")
-
def test_encode_decimal(space):
assert uh.unicode_encode_decimal(u' 12, 34 ', None) == ' 12, 34 '
with pytest.raises(ValueError):
@@ -178,3 +165,21 @@
result = uh.unicode_encode_decimal(
u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler)
assert result == '12&#4660;'
+
+if HAS_HYPOTHESIS:
+ @given(strategies.text())
+ def test_utf8_encode_ascii_2(u):
+ def eh(errors, encoding, reason, p, start, end):
+ return "?" * (end - start), end, 'b'
+ assert utf8_encode_ascii(u.encode("utf8"), "replace", eh) == u.encode("ascii", "replace")
+
+ @given(strategies.text())
+ def test_unicode_raw_escape(u):
+ r = uh.utf8_encode_raw_unicode_escape(u.encode("utf8"), 'strict', None)
+ assert r == u.encode("raw-unicode-escape")
+
+ @given(strategies.text())
+ def test_unicode_escape(u):
+ r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None)
+ assert r == u.encode("unicode-escape")
+
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1291,7 +1291,6 @@
allow_surrogates, "little",
'utf-16-le')
-
# ____________________________________________________________
# utf-32
diff --git a/pypy/module/__builtin__/operation.py b/pypy/module/__builtin__/operation.py
--- a/pypy/module/__builtin__/operation.py
+++ b/pypy/module/__builtin__/operation.py
@@ -38,29 +38,34 @@
return space.len(w_obj)
-def checkattrname(space, w_name):
+def checkattrname(space, w_name, msg):
# This is a check to ensure that getattr/setattr/delattr only pass a
- # string to the rest of the code. XXX not entirely sure if these three
+ # ascii string to the rest of the code. XXX not entirely sure if these
# functions are the only way for non-string objects to reach
# space.{get,set,del}attr()...
- # Note that if w_name is already an exact string it must be returned
- # unmodified (and not e.g. unwrapped-rewrapped).
- if not space.is_w(space.type(w_name), space.w_text):
- name = space.text_w(w_name) # typecheck
- w_name = space.newtext(name) # rewrap as a real string
+ # Note that if w_name is already an exact string it must be ascii encoded
+ if not space.isinstance_w(w_name, space.w_text):
+ try:
+ name = space.text_w(w_name) # typecheck
+ except OperationError as e:
+ if e.match(space, space.w_UnicodeError):
+ raise e
+ raise oefmt(space.w_TypeError,
+ "%s(): attribute name must be string", msg)
+ w_name = space.newtext(name)
return w_name
def delattr(space, w_object, w_name):
"""Delete a named attribute on an object.
delattr(x, 'y') is equivalent to ``del x.y''."""
- w_name = checkattrname(space, w_name)
+ w_name = checkattrname(space, w_name, 'delattr')
space.delattr(w_object, w_name)
return space.w_None
def getattr(space, w_object, w_name, w_defvalue=None):
"""Get a named attribute from an object.
getattr(x, 'y') is equivalent to ``x.y''."""
- w_name = checkattrname(space, w_name)
+ w_name = checkattrname(space, w_name, 'getattr')
try:
return space.getattr(w_object, w_name)
except OperationError as e:
@@ -72,7 +77,7 @@
def hasattr(space, w_object, w_name):
"""Return whether the object has an attribute with the given name.
(This is done by calling getattr(object, name) and catching exceptions.)"""
- w_name = checkattrname(space, w_name)
+ w_name = checkattrname(space, w_name, 'hasattr')
try:
space.getattr(w_object, w_name)
except OperationError as e:
@@ -174,7 +179,7 @@
def setattr(space, w_object, w_name, w_val):
"""Store a named attribute into an object.
setattr(x, 'y', z) is equivalent to ``x.y = z''."""
- w_name = checkattrname(space, w_name)
+ w_name = checkattrname(space, w_name, 'setattr')
space.setattr(w_object, w_name, w_val)
return space.w_None
diff --git a/pypy/module/_cffi_backend/misc.py b/pypy/module/_cffi_backend/misc.py
--- a/pypy/module/_cffi_backend/misc.py
+++ b/pypy/module/_cffi_backend/misc.py
@@ -408,8 +408,9 @@
def dlopen_w(space, w_filename, flags):
if WIN32 and space.isinstance_w(w_filename, space.w_unicode):
fname = space.text_w(space.repr(w_filename))
- unicode_name = space.realunicode_w(w_filename)
- with rffi.scoped_unicode2wcharp(unicode_name) as ll_libname:
+ utf8_name = space.utf8_w(w_filename)
+ uni_len = space.len_w(w_filename)
+ with rffi.scoped_utf82wcharp(utf8_name, uni_len) as ll_libname:
try:
handle = dlopenU(ll_libname, flags)
except DLOpenError as e:
diff --git a/pypy/module/_cffi_backend/test/test_wchar_helper.py b/pypy/module/_cffi_backend/test/test_wchar_helper.py
--- a/pypy/module/_cffi_backend/test/test_wchar_helper.py
+++ b/pypy/module/_cffi_backend/test/test_wchar_helper.py
@@ -1,10 +1,15 @@
-from hypothesis import given, strategies
+try:
+ from hypothesis import given, strategies
+ HAS_HYPOTHESIS = True
+except ImportError:
+ HAS_HYPOTHESIS = False
+
from pypy.module._cffi_backend.wchar_helper import utf8_size_as_char16
-
-@given(strategies.text())
-def test_utf8_size_as_char16(u):
- assert type(u) is unicode
- length = utf8_size_as_char16(''.join(uc.encode('utf8') for uc in u))
- assert length == sum((1 if uc <= u'\uFFFF' else 2) for uc in u)
+if HAS_HYPOTHESIS:
+ @given(strategies.text())
+ def test_utf8_size_as_char16(u):
+ assert type(u) is unicode
+ length = utf8_size_as_char16(''.join(uc.encode('utf8') for uc in u))
+ assert length == sum((1 if uc <= u'\uFFFF' else 2) for uc in u)
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -1,10 +1,9 @@
import sys
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
from rpython.rlib.objectmodel import we_are_translated, not_rpython
from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
from rpython.rlib import runicode
from rpython.rlib.runicode import raw_unicode_escape_helper
-from rpython.rlib import rutf8
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
@@ -248,6 +247,7 @@
def xmlcharrefreplace_errors(space, w_exc):
+
check_exception(space, w_exc)
if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
w_obj = space.getattr(w_exc, space.newtext('object'))
@@ -276,6 +276,7 @@
def backslashreplace_errors(space, w_exc):
+
check_exception(space, w_exc)
if (space.isinstance_w(w_exc, space.w_UnicodeEncodeError) or
space.isinstance_w(w_exc, space.w_UnicodeTranslateError)):
@@ -303,9 +304,9 @@
builder = StringBuilder()
pos = start
while pos < end:
- oc = ord(obj[pos])
+ oc = rutf8.codepoint_at_pos(obj, pos)
raw_unicode_escape_helper(builder, oc)
- pos += 1
+ pos = rutf8.next_codepoint_pos(obj, pos)
return space.newtuple([space.newtext(builder.build()), w_end])
else:
raise oefmt(space.w_TypeError,
@@ -663,6 +664,7 @@
def wrap_encoder(space, w_arg, errors="strict"):
# w_arg is a W_Unicode or W_Bytes?
w_arg = space.convert_arg_to_w_unicode(w_arg, errors)
+ w_arg = space.convert_arg_to_w_unicode(w_arg)
if errors is None:
errors = 'strict'
allow_surrogates = False
@@ -683,6 +685,7 @@
w_final=WrappedDefault(False))
def wrap_decoder(space, string, errors="strict", w_final=None):
+
if errors is None:
errors = 'strict'
final = space.is_true(w_final)
@@ -743,6 +746,7 @@
w_final = WrappedDefault(False))
def utf_8_decode(space, string, errors="strict", w_final=None):
+
if errors is None:
errors = 'strict'
final = space.is_true(w_final)
@@ -883,6 +887,7 @@
@unwrap_spec(string='bufferstr', errors='text_or_none')
def charmap_decode(space, string, errors="strict", w_mapping=None):
+
if errors is None:
errors = 'strict'
if len(string) == 0:
@@ -953,6 +958,7 @@
def unicode_escape_decode(space, w_string, errors="strict", w_final=None):
string = space.getarg_w('s*', w_string).as_str()
+
if errors is None:
errors = 'strict'
final = space.is_true(w_final)
@@ -987,6 +993,7 @@
@unwrap_spec(errors='text_or_none')
def unicode_internal_decode(space, w_string, errors="strict"):
+
if errors is None:
errors = 'strict'
# special case for this codec: unicodes are returned as is
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -298,8 +298,8 @@
]
buffer = b''
- result = ""
- for (c, partialresult) in zip("\x00\xff\u07ff\u0800\uffff\U00010000".encode(encoding), check_partial):
+ result = u""
+ for (c, partialresult) in zip(u"\x00\xff\u07ff\u0800\uffff\U00010000".encode(encoding), check_partial):
buffer += bytes([c])
res = _codecs.utf_8_decode(buffer,'strict',False)
if res[1] >0 :
@@ -327,8 +327,8 @@
u"\x00\xff\u0100\uffff\U00010000",
]
buffer = b''
- result = ""
- for (c, partialresult) in zip("\x00\xff\u0100\uffff\U00010000".encode(encoding), check_partial):
+ result = u""
+ for (c, partialresult) in zip(u"\x00\xff\u0100\uffff\U00010000".encode(encoding), check_partial):
buffer += bytes([c])
res = _codecs.utf_16_decode(buffer,'strict',False)
if res[1] >0 :
@@ -630,12 +630,12 @@
def test_charmap_decode_1(self):
import codecs
- assert codecs.charmap_encode('xxx') == (b'xxx', 3)
- assert codecs.charmap_encode('xxx', 'strict', {ord('x'): b'XX'}) == (b'XXXXXX', 3)
+ assert codecs.charmap_encode(u'xxx') == (b'xxx', 3)
+ assert codecs.charmap_encode(u'xxx', 'strict', {ord('x'): b'XX'}) == (b'XXXXXX', 3)
- res = codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab")
+ res = codecs.charmap_decode(b"\x00\x01\x02", "replace", u"ab")
assert res == ("ab\ufffd", 3)
- res = codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe")
+ res = codecs.charmap_decode(b"\x00\x01\x02", "replace", u"ab\ufffe")
assert res == ('ab\ufffd', 3)
def test_decode_errors(self):
@@ -654,28 +654,28 @@
def test_errors(self):
import codecs
assert codecs.replace_errors(UnicodeEncodeError(
- "ascii", "\u3042", 0, 1, "ouch")) == ("?", 1)
+ "ascii", u"\u3042", 0, 1, "ouch")) == (u"?", 1)
assert codecs.replace_errors(UnicodeDecodeError(
- "ascii", b"\xff", 0, 1, "ouch")) == ("\ufffd", 1)
+ "ascii", b"\xff", 0, 1, "ouch")) == (u"\ufffd", 1)
assert codecs.replace_errors(UnicodeTranslateError(
"\u3042", 0, 1, "ouch")) == ("\ufffd", 1)
assert codecs.replace_errors(UnicodeEncodeError(
- "ascii", "\u3042\u3042", 0, 2, "ouch")) == ("??", 2)
+ "ascii", "\u3042\u3042", 0, 2, "ouch")) == (u"??", 2)
assert codecs.replace_errors(UnicodeDecodeError(
- "ascii", b"\xff\xff", 0, 2, "ouch")) == ("\ufffd", 2)
+ "ascii", b"\xff\xff", 0, 2, "ouch")) == (u"\ufffd", 2)
assert codecs.replace_errors(UnicodeTranslateError(
"\u3042\u3042", 0, 2, "ouch")) == ("\ufffd\ufffd", 2)
class BadStartUnicodeEncodeError(UnicodeEncodeError):
def __init__(self):
- UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad")
+ UnicodeEncodeError.__init__(self, "ascii", u"", 0, 1, "bad")
self.start = []
# A UnicodeEncodeError object with a bad object attribute
class BadObjectUnicodeEncodeError(UnicodeEncodeError):
def __init__(self):
- UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad")
+ UnicodeEncodeError.__init__(self, "ascii", u"", 0, 1, "bad")
self.object = []
# A UnicodeDecodeError object without an end attribute
@@ -693,19 +693,19 @@
# A UnicodeTranslateError object without a start attribute
class NoStartUnicodeTranslateError(UnicodeTranslateError):
def __init__(self):
- UnicodeTranslateError.__init__(self, "", 0, 1, "bad")
+ UnicodeTranslateError.__init__(self, u"", 0, 1, "bad")
del self.start
# A UnicodeTranslateError object without an end attribute
class NoEndUnicodeTranslateError(UnicodeTranslateError):
def __init__(self):
- UnicodeTranslateError.__init__(self, "", 0, 1, "bad")
+ UnicodeTranslateError.__init__(self, u"", 0, 1, "bad")
del self.end
# A UnicodeTranslateError object without an object attribute
class NoObjectUnicodeTranslateError(UnicodeTranslateError):
def __init__(self):
- UnicodeTranslateError.__init__(self, "", 0, 1, "bad")
+ UnicodeTranslateError.__init__(self, u"", 0, 1, "bad")
del self.object
import codecs
@@ -716,7 +716,7 @@
raises(TypeError, codecs.replace_errors, BadObjectUnicodeEncodeError())
raises(TypeError, codecs.replace_errors, BadObjectUnicodeDecodeError()
)
- # With the correct exception, "replace" returns an "?" or "\ufffd" replacement
+ # With the correct exception, "replace" returns an "?" or u"\ufffd" replacement
def test_decode_ignore(self):
assert b'\xff'.decode('utf-7', 'ignore') == ''
@@ -724,7 +724,6 @@
def test_backslashreplace(self):
import sys
- import codecs
sin = u"a\xac\u1234\u20ac\u8000\U0010ffff"
if sys.maxunicode > 65535:
expected_ascii = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
@@ -827,7 +826,7 @@
def test_badhandler(self):
import codecs
- results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
+ results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
for res in results:
@@ -856,7 +855,7 @@
import codecs
import sys
errors = 'test.badhandler_longindex'
- codecs.register_error(errors, lambda x: ('', sys.maxsize + 1))
+ codecs.register_error(errors, lambda x: (u'', sys.maxsize + 1))
# CPython raises OverflowError here
raises((IndexError, OverflowError), b'apple\x92ham\x93spam'.decode, 'utf-8', errors)
@@ -872,15 +871,15 @@
res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "replace")
if sys.maxunicode > 65535:
- assert res == "\u0000\ufffd" # UCS4 build
+ assert res == u"\u0000\ufffd" # UCS4 build
else:
- assert res == "\x00\x00\ufffd" # UCS2 build
+ assert res == u"\x00\x00\ufffd" # UCS2 build
res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore")
if sys.maxunicode > 65535:
- assert res == "\u0000" # UCS4 build
+ assert res == u"\u0000" # UCS4 build
else:
- assert res == "\x00\x00" # UCS2 build
+ assert res == u"\x00\x00" # UCS2 build
def handler_unicodeinternal(exc):
if not isinstance(exc, UnicodeDecodeError):
@@ -889,9 +888,9 @@
codecs.register_error("test.hui", handler_unicodeinternal)
res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui")
if sys.maxunicode > 65535:
- assert res == "\u0000\u0001\u0000" # UCS4 build
+ assert res == u"\u0000\u0001" # UCS4 build
else:
- assert res == "\x00\x00\x01\x00\x00" # UCS2 build
+ assert res == u"\x00\x00\x01" # UCS2 build
def handler1(exc):
if not isinstance(exc, UnicodeEncodeError) \
@@ -946,12 +945,12 @@
def test_encode_error_bad_handler(self):
import codecs
codecs.register_error("test.bad_handler", lambda e: (repl, 1))
- assert "xyz".encode("latin-1", "test.bad_handler") == b"xyz"
- repl = "\u1234"
- raises(UnicodeEncodeError, "\u5678".encode, "latin-1",
+ assert u"xyz".encode("latin-1", "test.bad_handler") == "xyz"
+ repl = u"\u1234"
+ raises(UnicodeEncodeError, u"\u5678".encode, "latin-1",
"test.bad_handler")
- repl = "\u00E9"
- s = "\u5678".encode("latin-1", "test.bad_handler")
+ repl = u"\u00E9"
+ s = u"\u5678".encode("latin-1", "test.bad_handler")
assert s == b'\xe9'
raises(UnicodeEncodeError, "\u5678".encode, "ascii",
"test.bad_handler")
@@ -993,7 +992,7 @@
charmap = dict([(c, bytes([c, c]).upper()) for c in b"abcdefgh"])
charmap[ord("?")] = b"XYZ"
import codecs
- sin = "abcDEF"
+ sin = u"abcDEF"
sout = codecs.charmap_encode(sin, "replace", charmap)[0]
assert sout == b"AABBCCXYZXYZXYZ"
@@ -1002,7 +1001,7 @@
def test_charmap_build(self):
import codecs
- assert codecs.charmap_build('123456') == {49: 0, 50: 1, 51: 2,
+ assert codecs.charmap_build(u'123456') == {49: 0, 50: 1, 51: 2,
52: 3, 53: 4, 54: 5}
def test_utf7_start_end_in_exception(self):
@@ -1013,7 +1012,7 @@
assert exc.end == 3
def test_utf7_surrogate(self):
- assert b'+3ADYAA-'.decode('utf-7') == '\udc00\ud800'
+ assert b'+3ADYAA-'.decode('utf-7') == u'\udc00\ud800'
def test_utf7_errors(self):
import codecs
@@ -1044,7 +1043,7 @@
def test_utf_16_encode_decode(self):
import codecs, sys
- x = '123abc'
+ x = u'123abc'
if sys.byteorder == 'big':
assert codecs.getencoder('utf-16')(x) == (
b'\xfe\xff\x001\x002\x003\x00a\x00b\x00c', 6)
@@ -1058,10 +1057,10 @@
def test_unicode_escape(self):
import _codecs
- assert '\\'.encode('unicode-escape') == b'\\\\'
- assert b'\\\\'.decode('unicode-escape') == '\\'
- assert '\ud801'.encode('unicode-escape') == b'\\ud801'
- assert '\u0013'.encode('unicode-escape') == b'\\x13'
+ assert u'\\'.encode('unicode-escape') == b'\\\\'
+ assert b'\\\\'.decode('unicode-escape') == u'\\'
+ assert u'\ud801'.encode('unicode-escape') == b'\\ud801'
+ assert u'\u0013'.encode('unicode-escape') == b'\\x13'
assert _codecs.unicode_escape_decode(r"\u1234") == ("\u1234", 6)
def test_mbcs(self):
diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py
--- a/pypy/module/_io/test/test_interp_textio.py
+++ b/pypy/module/_io/test/test_interp_textio.py
@@ -1,6 +1,6 @@
import pytest
try:
- from hypothesis import given, strategies as st, settings
+ from hypothesis import given, strategies as st, settings, example
except ImportError:
pytest.skip("hypothesis required")
import os
@@ -63,6 +63,7 @@
assert buf.exhausted()
@given(st.text(), st.lists(st.integers(min_value=0)))
+@example(u'\x80', [1])
def test_readn_buffer(text, sizes):
buf = DecodeBuffer(text.encode('utf-8'))
strings = []
@@ -80,5 +81,5 @@
buf = DecodeBuffer(text.encode('utf-8'))
for i in range(len(text)):
ch = buf.next_char()
- assert ch == text[i].encode('utf-8')[0]
+ assert ch == text[i].encode('utf-8')
assert buf.exhausted()
diff --git a/pypy/module/_io/test/test_ztranslation.py b/pypy/module/_io/test/test_ztranslation.py
deleted file mode 100644
--- a/pypy/module/_io/test/test_ztranslation.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from pypy.interpreter.typedef import GetSetProperty
-from pypy.module.exceptions.interp_exceptions import W_BaseException
-from pypy.objspace.fake.checkmodule import checkmodule
-
-def test_checkmodule():
- # XXX: PyTraceback usage in these methods blows up checkmodule
- def descr_gettraceback(self, space):
- return space.w_None
- def descr_settraceback(self, space, w_newtraceback):
- pass
- W_BaseException.descr_gettraceback = descr_gettraceback
- W_BaseException.descr_settraceback = descr_settraceback
- W_BaseException.typedef.add_entries(
- __traceback__=GetSetProperty(descr_gettraceback, descr_settraceback))
- checkmodule('_io')
diff --git a/pypy/module/_rawffi/structure.py b/pypy/module/_rawffi/structure.py
--- a/pypy/module/_rawffi/structure.py
+++ b/pypy/module/_rawffi/structure.py
@@ -14,7 +14,7 @@
from pypy.module._rawffi.interp_rawffi import unroll_letters_for_numbers
from pypy.module._rawffi.interp_rawffi import size_alignment
from pypy.module._rawffi.interp_rawffi import read_ptr, write_ptr
-from rpython.rlib import clibffi, rgc
+from rpython.rlib import clibffi, rgc, rutf8
from rpython.rlib.rarithmetic import intmask, signedtype, r_uint, \
r_ulonglong
from rpython.rtyper.lltypesystem import lltype, rffi
@@ -163,6 +163,10 @@
if name in name_to_index:
raise oefmt(space.w_ValueError,
"duplicate field name %s", name)
+ try:
+ rutf8.check_ascii(name)
+ except rutf8.CheckError:
+ raise oefmt(space.w_TypeError, 'non-ascii field name')
name_to_index[name] = i
size, alignment, pos, bitsizes = size_alignment_pos(
fields, is_union, pack)
diff --git a/pypy/module/_rawffi/test/test__rawffi.py b/pypy/module/_rawffi/test/test__rawffi.py
--- a/pypy/module/_rawffi/test/test__rawffi.py
+++ b/pypy/module/_rawffi/test/test__rawffi.py
@@ -352,9 +352,10 @@
import _rawffi
A = _rawffi.Array('u')
a = A(6, 'xx\x00\x00xx')
- res = _rawffi.wcharp2unicode(a.buffer)
- assert isinstance(res, str)
- assert res == 'xx'
+ for i in (-1, 6):
+ res = _rawffi.wcharp2unicode(a.buffer, i)
+ assert isinstance(res, str)
+ assert res == u'xx'
a.free()
def test_rawstring2charp(self):
diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -439,7 +439,8 @@
if len(s) % self.itemsize != 0:
raise oefmt(space.w_ValueError,
"bytes length not a multiple of item size")
- #self.check_valid_unicode(space, s) # empty for non-u arrays
+ # CPython accepts invalid unicode
+ # self.check_valid_unicode(space, s) # empty for non-u arrays
oldlen = self.len
new = len(s) / self.itemsize
if not new:
@@ -1150,11 +1151,25 @@
elif mytype.typecode == 'c':
return space.newbytes(item)
elif mytype.typecode == 'u':
- if ord(item) >= 0x110000:
- raise oefmt(space.w_ValueError,
- "array contains a unicode character out of "
- "range(0x110000)")
- return space.newtext(rutf8.unichr_as_utf8(ord(item)), 1)
+ code = r_uint(ord(item))
+ # cpython will allow values > sys.maxunicode
+ # while silently truncating the top bits
+ if code <= r_uint(0x7F):
+ # Encode ASCII
+ item = chr(code)
+ elif code <= r_uint(0x07FF):
+ item = (chr((0xc0 | (code >> 6))) +
+ chr((0x80 | (code & 0x3f))))
+ elif code <= r_uint(0xFFFF):
+ item = (chr((0xe0 | (code >> 12))) +
+ chr((0x80 | ((code >> 6) & 0x3f))) +
+ chr((0x80 | (code & 0x3f))))
+ else:
+ item = (chr((0xf0 | (code >> 18)) & 0xff) +
+ chr((0x80 | ((code >> 12) & 0x3f))) +
+ chr((0x80 | ((code >> 6) & 0x3f))) +
+ chr((0x80 | (code & 0x3f))))
+ return space.newutf8(item, 1)
assert 0, "unreachable"
# interface
diff --git a/pypy/module/array/test/test_array.py b/pypy/module/array/test/test_array.py
--- a/pypy/module/array/test/test_array.py
+++ b/pypy/module/array/test/test_array.py
@@ -635,6 +635,12 @@
assert a[1] == 2
assert a[2] == 3
+ def test_deepcopy(self):
+ a = self.array('u', u'\x01\u263a\x00\ufeff')
+ from copy import deepcopy
+ b = deepcopy(a)
+ assert a == b
+
def test_addmul(self):
a = self.array('i', [1, 2, 3])
assert repr(a + a) == "array('i', [1, 2, 3, 1, 2, 3])"
@@ -892,14 +898,6 @@
b.byteswap()
raises(ValueError, "a != b")
- def test_unicode_ord_positive(self):
- import sys
- if sys.maxunicode == 0xffff:
- skip("test for 32-bit unicodes")
- a = self.array('u', b'\xff\xff\xff\xff')
- assert len(a) == 1
- raises(ValueError, "a[0]")
-
def test_weakref(self):
import weakref
a = self.array('u', 'Hi!')
diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py
--- a/pypy/objspace/std/bytesobject.py
+++ b/pypy/objspace/std/bytesobject.py
@@ -431,10 +431,17 @@
def utf8_w(self, space):
return self._value
+ def utf8_w(self, space):
+ return self._value
+
def buffer_w(self, space, flags):
space.check_buf_flags(flags, True)
return SimpleView(StringBuffer(self._value))
+ def descr_encode(self, space, w_encoding=None, w_errors=None):
+ w_uni = self.descr_decode(space, space.newtext('ascii'), space.newtext('strict'))
+ return space.call_method(w_uni, 'encode', w_encoding, w_errors)
+
def descr_getbuffer(self, space, w_flags):
#from pypy.objspace.std.bufferobject import W_Buffer
#return W_Buffer(StringBuffer(self._value))
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -11,6 +11,7 @@
from rpython.tool.sourcetools import func_with_new_name
from pypy.interpreter.error import OperationError, oefmt
+from pypy.interpreter.unicodehelper import check_ascii_or_raise
class BaseStringFormatter(object):
@@ -498,7 +499,6 @@
else:
s = ''
if len(s) == 1:
- self.std_wp(s)
return
raise oefmt(space.w_TypeError, "%c requires int or single byte")
else:
@@ -581,7 +581,7 @@
formatter = UnicodeFormatter(space, fmt, values_w, w_valuedict)
result = formatter.format()
# this can force strings, not sure if it's a problem or not
- lgt = rutf8.check_utf8(result, True)
+ lgt = rutf8.codepoints_in_utf8(result)
return space.newutf8(result, lgt)
def mod_format(space, w_format, w_values, fmt_type=FORMAT_STR):
diff --git a/pypy/objspace/std/test/test_bytesobject.py b/pypy/objspace/std/test/test_bytesobject.py
--- a/pypy/objspace/std/test/test_bytesobject.py
+++ b/pypy/objspace/std/test/test_bytesobject.py
@@ -1,7 +1,9 @@
# coding: utf-8
+import pytest
from pypy.interpreter.error import OperationError
+
class TestW_BytesObject:
def teardown_method(self, method):
@@ -637,6 +639,7 @@
def test_unicode_join_str_arg_ascii(self):
raises(TypeError, ''.join, [b'\xc3\xa1'])
+ @pytest.mark.xfail(reason='setdefaultencoding does not work?')
def test_unicode_join_endcase(self):
# This class inserts a Unicode object into its argument's natural
# iteration, in the 3rd position.
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -1,7 +1,12 @@
# -*- encoding: utf-8 -*-
import py
import sys
-from hypothesis import given, strategies, settings, example
+try:
+ from hypothesis import given, strategies, settings, example
+ HAS_HYPOTHESIS = True
+except ImportError:
+ HAS_HYPOTHESIS = False
+
from rpython.rlib import rutf8
from pypy.interpreter.error import OperationError
@@ -33,86 +38,145 @@
space.w_unicode, "__new__", space.w_unicode, w_uni)
assert w_new is w_uni
- @given(strategies.text(), strategies.integers(min_value=0, max_value=10),
- strategies.integers(min_value=-1, max_value=10))
- def test_hypo_index_find(self, u, start, len1):
- if start + len1 < 0:
- return # skip this case
- v = u[start : start + len1]
- space = self.space
- w_u = space.newutf8(u.encode('utf8'), len(u))
- w_v = space.newutf8(v.encode('utf8'), len(v))
- expected = u.find(v, start, start + len1)
- try:
- w_index = space.call_method(w_u, 'index', w_v,
+ if HAS_HYPOTHESIS:
+ @given(strategies.text(), strategies.integers(min_value=0, max_value=10),
+ strategies.integers(min_value=-1, max_value=10))
+ def test_hypo_index_find(self, u, start, len1):
+ if start + len1 < 0:
+ return # skip this case
+ v = u[start : start + len1]
+ space = self.space
+ w_u = space.newutf8(u.encode('utf8'), len(u))
+ w_v = space.newutf8(v.encode('utf8'), len(v))
+ expected = u.find(v, start, start + len1)
+ try:
+ w_index = space.call_method(w_u, 'index', w_v,
+ space.newint(start),
+ space.newint(start + len1))
+ except OperationError as e:
+ if not e.match(space, space.w_ValueError):
+ raise
+ assert expected == -1
+ else:
+ assert space.int_w(w_index) == expected >= 0
+
+ w_index = space.call_method(w_u, 'find', w_v,
space.newint(start),
space.newint(start + len1))
- except OperationError as e:
- if not e.match(space, space.w_ValueError):
- raise
- assert expected == -1
- else:
- assert space.int_w(w_index) == expected >= 0
+ assert space.int_w(w_index) == expected
+ rexpected = u.rfind(v, start, start + len1)
+ try:
+ w_index = space.call_method(w_u, 'rindex', w_v,
+ space.newint(start),
+ space.newint(start + len1))
+ except OperationError as e:
+ if not e.match(space, space.w_ValueError):
+ raise
+ assert rexpected == -1
+ else:
+ assert space.int_w(w_index) == rexpected >= 0
- w_index = space.call_method(w_u, 'find', w_v,
- space.newint(start),
- space.newint(start + len1))
- assert space.int_w(w_index) == expected
-
- rexpected = u.rfind(v, start, start + len1)
- try:
- w_index = space.call_method(w_u, 'rindex', w_v,
+ w_index = space.call_method(w_u, 'rfind', w_v,
space.newint(start),
space.newint(start + len1))
- except OperationError as e:
- if not e.match(space, space.w_ValueError):
- raise
- assert rexpected == -1
- else:
- assert space.int_w(w_index) == rexpected >= 0
+ assert space.int_w(w_index) == rexpected
- w_index = space.call_method(w_u, 'rfind', w_v,
- space.newint(start),
- space.newint(start + len1))
- assert space.int_w(w_index) == rexpected
+ expected = u.startswith(v, start)
+ w_res = space.call_method(w_u, 'startswith', w_v,
+ space.newint(start))
+ assert w_res is space.newbool(expected)
- expected = u.startswith(v, start)
- if expected and start > len(u):
- expected = False # python2 vs. python3
- w_res = space.call_method(w_u, 'startswith', w_v,
- space.newint(start))
- assert w_res is space.newbool(expected)
+ expected = u.startswith(v, start, start + len1)
+ w_res = space.call_method(w_u, 'startswith', w_v,
+ space.newint(start),
+ space.newint(start + len1))
+ assert w_res is space.newbool(expected)
- expected = u.startswith(v, start, start + len1)
- if expected and start > len(u):
- expected = False # python2 vs. python3
- w_res = space.call_method(w_u, 'startswith', w_v,
- space.newint(start),
- space.newint(start + len1))
- assert w_res is space.newbool(expected)
+ expected = u.endswith(v, start)
+ w_res = space.call_method(w_u, 'endswith', w_v,
+ space.newint(start))
+ assert w_res is space.newbool(expected)
- expected = u.endswith(v, start)
- if expected and start > len(u):
- expected = False # python2 vs. python3
- w_res = space.call_method(w_u, 'endswith', w_v,
- space.newint(start))
- assert w_res is space.newbool(expected)
+ expected = u.endswith(v, start, start + len1)
+ w_res = space.call_method(w_u, 'endswith', w_v,
+ space.newint(start),
+ space.newint(start + len1))
+ assert w_res is space.newbool(expected)
- expected = u.endswith(v, start, start + len1)
- if expected and start > len(u):
- expected = False # python2 vs. python3
- w_res = space.call_method(w_u, 'endswith', w_v,
- space.newint(start),
- space.newint(start + len1))
- assert w_res is space.newbool(expected)
- def test_text_w(self):
- space = self.space
- w_uni = space.wrap(u'abcd')
- assert space.text_w(w_uni) == 'abcd'
- w_uni = space.wrap(unichr(0xd921) + unichr(0xdddd))
- # XXXX Test is from py3.5, should this still fail?
- space.raises_w(space.w_UnicodeEncodeError, space.text_w, w_uni)
+ @given(u=strategies.text(),
+ start=strategies.integers(min_value=0, max_value=10),
+ len1=strategies.integers(min_value=-1, max_value=10))
+ def test_hypo_index_find(self, u, start, len1):
+ space = self.space
+ if start + len1 < 0:
+ return # skip this case
+ v = u[start : start + len1]
+ w_u = space.wrap(u)
+ w_v = space.wrap(v)
+ expected = u.find(v, start, start + len1)
+ try:
+ w_index = space.call_method(w_u, 'index', w_v,
+ space.newint(start),
+ space.newint(start + len1))
+ except OperationError as e:
+ if not e.match(space, space.w_ValueError):
+ raise
+ assert expected == -1
+ else:
+ assert space.int_w(w_index) == expected >= 0
+
+ w_index = space.call_method(w_u, 'find', w_v,
+ space.newint(start),
+ space.newint(start + len1))
+ assert space.int_w(w_index) == expected
+
+ rexpected = u.rfind(v, start, start + len1)
+ try:
+ w_index = space.call_method(w_u, 'rindex', w_v,
+ space.newint(start),
+ space.newint(start + len1))
+ except OperationError as e:
+ if not e.match(space, space.w_ValueError):
+ raise
+ assert rexpected == -1
+ else:
+ assert space.int_w(w_index) == rexpected >= 0
+
+ w_index = space.call_method(w_u, 'rfind', w_v,
+ space.newint(start),
+ space.newint(start + len1))
+ assert space.int_w(w_index) == rexpected
+
+ expected = u.startswith(v, start)
+ if expected and start > len(u):
+ expected = False # python2 vs. python3
+ w_res = space.call_method(w_u, 'startswith', w_v,
+ space.newint(start))
+ assert w_res is space.newbool(expected)
+
+ expected = u.startswith(v, start, start + len1)
+ if expected and start > len(u):
+ expected = False # python2 vs. python3
+ w_res = space.call_method(w_u, 'startswith', w_v,
+ space.newint(start),
+ space.newint(start + len1))
+ assert w_res is space.newbool(expected)
+
+ expected = u.endswith(v, start)
+ if expected and start > len(u):
+ expected = False # python2 vs. python3
+ w_res = space.call_method(w_u, 'endswith', w_v,
+ space.newint(start))
+ assert w_res is space.newbool(expected)
+
+ expected = u.endswith(v, start, start + len1)
+ if expected and start > len(u):
+ expected = False # python2 vs. python3
+ w_res = space.call_method(w_u, 'endswith', w_v,
+ space.newint(start),
+ space.newint(start + len1))
+ assert w_res is space.newbool(expected)
class AppTestUnicodeStringStdOnly:
@@ -853,18 +917,20 @@
def test_rfind_corner_case(self):
assert 'abc'.rfind('', 4) == -1
- def test_count(self):
- assert "".count("x") ==0
- assert "".count("") ==1
- assert "Python".count("") ==7
- assert "ab aaba".count("ab") ==2
- assert 'aaa'.count('a') == 3
- assert 'aaa'.count('b') == 0
- assert 'aaa'.count('a', -1) == 1
- assert 'aaa'.count('a', -10) == 3
- assert 'aaa'.count('a', 0, -1) == 2
- assert 'aaa'.count('a', 0, -10) == 0
- assert 'ababa'.count('aba') == 1
+ def test_count_unicode(self):
+ assert u'aaa'.count(u'', 10) == 0
+ assert u'aaa'.count(u'', 3) == 1
+ assert u"".count(u"x") ==0
+ assert u"".count(u"") ==1
+ assert u"Python".count(u"") ==7
+ assert u"ab aaba".count(u"ab") ==2
+ assert u'aaa'.count(u'a') == 3
+ assert u'aaa'.count(u'b') == 0
+ assert u'aaa'.count(u'a', -1) == 1
+ assert u'aaa'.count(u'a', -10) == 3
+ assert u'aaa'.count(u'a', 0, -1) == 2
+ assert u'aaa'.count(u'a', 0, -10) == 0
+ assert u'ababa'.count(u'aba') == 1
def test_swapcase(self):
assert '\xe4\xc4\xdf'.swapcase() == '\xc4\xe4SS'
@@ -1210,9 +1276,8 @@
assert type(str(z)) is str
assert str(z) == u'foobaz'
#
- # two completely corner cases where we differ from CPython:
- #assert unicode(encoding='supposedly_the_encoding') == u''
- #assert unicode(errors='supposedly_the_error') == u''
+ assert unicode(encoding='supposedly_the_encoding') == u''
+ assert unicode(errors='supposedly_the_error') == u''
e = raises(TypeError, str, u'', 'supposedly_the_encoding')
assert str(e.value) == 'decoding str is not supported'
e = raises(TypeError, str, u'', errors='supposedly_the_error')
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -460,6 +460,7 @@
ptr = self.start_ptr
if not self.next_char_ok(ctx, pattern, ptr, self.ppos3):
return
+ assert not isinstance(ctx, AbstractMatchContext)
self.start_ptr = ctx.next(ptr)
return self.find_first_result(ctx, pattern)
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -8,6 +8,7 @@
from rpython.rlib import jit, nonconst
+# We always use MAXUNICODE = 0x10ffff when unicode objects use utf8
if 1 or rffi.sizeof(lltype.UniChar) == 4:
MAXUNICODE = 0x10ffff
allow_surrogate_by_default = False
diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -1046,7 +1046,7 @@
s = rutf8.Utf8StringBuilder(maxlen)
i = 0
- while i < maxlen and w[i] != '\x00':
+ while i < maxlen and ord(w[i]):
s.append_code(ord(w[i]))
i += 1
return s.build(), i
More information about the pypy-commit
mailing list