[pypy-commit] pypy utf8-unicode2: pobjspace and interpreter tests now pass
waedt
noreply at buildbot.pypy.org
Tue Jul 8 09:43:29 CEST 2014
Author: Tyler Wade <wayedt at gmail.com>
Branch: utf8-unicode2
Changeset: r72383:104602bd7dd9
Date: 2014-07-08 02:37 -0500
http://bitbucket.org/pypy/pypy/changeset/104602bd7dd9/
Log: pobjspace and interpreter tests now pass
diff too long, truncating to 2000 out of 2118 lines
diff --git a/pypy/interpreter/astcompiler/test/test_astbuilder.py b/pypy/interpreter/astcompiler/test/test_astbuilder.py
--- a/pypy/interpreter/astcompiler/test/test_astbuilder.py
+++ b/pypy/interpreter/astcompiler/test/test_astbuilder.py
@@ -8,6 +8,7 @@
from pypy.interpreter.pyparser.error import SyntaxError
from pypy.interpreter.astcompiler.astbuilder import ast_from_node
from pypy.interpreter.astcompiler import ast, consts
+from pypy.interpreter.utf8 import Utf8Str
class TestAstBuilder:
@@ -1103,7 +1104,7 @@
assert info.encoding == "utf-7"
s = ast_from_node(space, tree, info).body[0].value
assert isinstance(s, ast.Str)
- assert space.eq_w(s.s, space.wrap(sentence))
+ assert space.eq_w(s.s, space.wrap(Utf8Str.from_unicode(sentence)))
def test_string_bug(self):
space = self.space
diff --git a/pypy/interpreter/astcompiler/test/test_compiler.py b/pypy/interpreter/astcompiler/test/test_compiler.py
--- a/pypy/interpreter/astcompiler/test/test_compiler.py
+++ b/pypy/interpreter/astcompiler/test/test_compiler.py
@@ -919,11 +919,7 @@
import sys
d = {}
exec '# -*- coding: utf-8 -*-\n\nu = u"\xf0\x9f\x92\x8b"' in d
- if sys.maxunicode > 65535 and self.maxunicode > 65535:
- expected_length = 1
- else:
- expected_length = 2
- assert len(d['u']) == expected_length
+ assert len(d['u']) == 1
class TestOptimizations:
diff --git a/pypy/interpreter/pycode.py b/pypy/interpreter/pycode.py
--- a/pypy/interpreter/pycode.py
+++ b/pypy/interpreter/pycode.py
@@ -153,6 +153,10 @@
const = code_hook(space, const, hidden_applevel, code_hook)
if isinstance(const, unicode):
const = Utf8Str.from_unicode(const)
+ if isinstance(const, tuple):
+ const = tuple(x if not isinstance(x, unicode)
+ else Utf8Str.from_unicode(x)
+ for x in const)
newconsts_w[num] = space.wrap(const)
num += 1
# stick the underlying CPython magic value, if the code object
diff --git a/pypy/interpreter/pyparser/test/test_parsestring.py b/pypy/interpreter/pyparser/test/test_parsestring.py
--- a/pypy/interpreter/pyparser/test/test_parsestring.py
+++ b/pypy/interpreter/pyparser/test/test_parsestring.py
@@ -102,7 +102,4 @@
def test_decode_unicode_utf8(self):
buf = parsestring.decode_unicode_utf8(self.space,
'u"\xf0\x9f\x92\x8b"', 2, 6)
- if sys.maxunicode == 65535:
- assert buf == r"\U0000d83d\U0000dc8b"
- else:
- assert buf == r"\U0001f48b"
+ assert buf == r"\U0001f48b"
diff --git a/pypy/interpreter/test/test_gateway.py b/pypy/interpreter/test/test_gateway.py
--- a/pypy/interpreter/test/test_gateway.py
+++ b/pypy/interpreter/test/test_gateway.py
@@ -4,6 +4,7 @@
from pypy.interpreter import gateway, argument
from pypy.interpreter.gateway import ObjSpace, W_Root, WrappedDefault
from pypy.interpreter.signature import Signature
+from pypy.interpreter.utf8 import Utf8Str
import py
import sys
@@ -519,7 +520,7 @@
unicode])
w_app_g3_u = space.wrap(app_g3_u)
assert self.space.eq_w(
- space.call_function(w_app_g3_u, w(u"foo")),
+ space.call_function(w_app_g3_u, w(Utf8Str("foo"))),
w(3))
assert self.space.eq_w(
space.call_function(w_app_g3_u, w("baz")),
diff --git a/pypy/interpreter/test/test_objspace.py b/pypy/interpreter/test/test_objspace.py
--- a/pypy/interpreter/test/test_objspace.py
+++ b/pypy/interpreter/test/test_objspace.py
@@ -2,6 +2,7 @@
from pypy.interpreter.error import OperationError
from pypy.interpreter.function import Function
from pypy.interpreter.pycode import PyCode
+from pypy.interpreter.utf8 import Utf8Str
from rpython.rlib.rarithmetic import r_longlong, r_ulonglong
import sys
@@ -217,8 +218,9 @@
w = space.wrap
assert space.str0_w(w("123")) == "123"
exc = space.raises_w(space.w_TypeError, space.str0_w, w("123\x004"))
- assert space.unicode0_w(w(u"123")) == u"123"
- exc = space.raises_w(space.w_TypeError, space.unicode0_w, w(u"123\x004"))
+ assert space.unicode0_w(w(Utf8Str("123"))) == u"123"
+ exc = space.raises_w(space.w_TypeError, space.unicode0_w,
+ w(Utf8Str.from_unicode(u"123\x004")))
def test_getindex_w(self):
w_instance1 = self.space.appexec([], """():
diff --git a/pypy/interpreter/test/test_utf8.py b/pypy/interpreter/test/test_utf8.py
--- a/pypy/interpreter/test/test_utf8.py
+++ b/pypy/interpreter/test/test_utf8.py
@@ -35,13 +35,15 @@
iter.move(i)
if i != 4:
assert iter.peek_next() == [0x41, 0x10F, 0x20AC, 0x1F63D][i]
- assert list(iter) == [0x41, 0x10F, 0x20AC, 0x1F63D][i:]
+ l = list(iter)
+ assert l == [0x41, 0x10F, 0x20AC, 0x1F63D][i:]
for i in range(1, 5):
iter = s.codepoint_iter()
list(iter) # move the iterator to the end
iter.move(-i)
- assert list(iter) == [0x41, 0x10F, 0x20AC, 0x1F63D][4-i:]
+ l = list(iter)
+ assert l == [0x41, 0x10F, 0x20AC, 0x1F63D][4-i:]
iter = s.char_iter()
l = [s.bytes.decode('utf8') for s in list(iter)]
@@ -50,6 +52,27 @@
else:
assert l == [u'A', u'\u010F', u'\u20AC', u'\U00001F63D']
+def test_reverse_iterator():
+ s = build_utf8str()
+ iter = s.reverse_codepoint_iter()
+ assert iter.peek_next() == 0x1F63D
+ assert list(iter) == [0x1F63D, 0x20AC, 0x10F, 0x41]
+
+ for i in range(1, 5):
+ iter = s.reverse_codepoint_iter()
+ iter.move(i)
+ if i != 4:
+ assert iter.peek_next() == [0x1F63D, 0x20AC, 0x10F, 0x41][i]
+ l = list(iter)
+ assert l == [0x1F63D, 0x20AC, 0x10F, 0x41][i:]
+
+ for i in range(1, 5):
+ iter = s.reverse_codepoint_iter()
+ list(iter) # move the iterator to the end
+ iter.move(-i)
+ l = list(iter)
+ assert l == [0x1F63D, 0x20AC, 0x10F, 0x41][4-i:]
+
def test_builder_append_slice():
builder = Utf8Builder()
builder.append_slice(Utf8Str.from_unicode(u"0ê0"), 1, 2)
@@ -57,6 +80,10 @@
assert builder.build() == u"êes"
+def test_eq():
+ assert Utf8Str('test') == Utf8Str('test')
+ assert Utf8Str('test') != Utf8Str('test1')
+
def test_unicode_literal_comparison():
builder = Utf8Builder()
builder.append(0x10F)
@@ -152,5 +179,17 @@
assert s.split() == u.split()
assert s.split(' ') == u.split(' ')
- assert s.split(maxsplit=1) == u.split(None, 1)
+ assert s.split(maxsplit=2) == u.split(None, 2)
+ assert s.split(' ', 2) == u.split(' ', 2)
assert s.split('\n') == [s]
+
+def test_rsplit():
+ # U+00A0 is a non-breaking space
+ u = u"one two three\xA0four"
+ s = Utf8Str.from_unicode(u)
+
+ assert s.rsplit() == u.rsplit()
+ assert s.rsplit(' ') == u.rsplit(' ')
+ assert s.rsplit(maxsplit=2) == u.rsplit(None, 2)
+ assert s.rsplit(' ', 2) == u.rsplit(' ', 2)
+ assert s.rsplit('\n') == [s]
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -104,6 +104,9 @@
return Utf8Str('')
# TODO: If start > _len or stop >= _len, then raise exception
+ if stop > len(self):
+ stop = len(self)
+
if self._is_ascii:
return Utf8Str(self.bytes[start:stop], True)
@@ -124,6 +127,12 @@
return Utf8Str(self.bytes[start_byte:stop_byte], is_ascii,
stop - start)
+ def byte_slice(self, start, end):
+ return Utf8Str(self.bytes[start:end], self._is_ascii)
+
+ def __repr__(self):
+ return "<Utf8Str: %r>" % unicode(self)
+
def __add__(self, other):
return Utf8Str(self.bytes + other.bytes,
self._is_ascii and other._is_ascii)
@@ -134,6 +143,9 @@
def __len__(self):
return self._len
+ def __hash__(self):
+ return hash(self.bytes)
+
def __eq__(self, other):
"""NOT_RPYTHON"""
if isinstance(other, Utf8Str):
@@ -143,6 +155,27 @@
return False
+ def __ne__(self, other):
+ """NOT_RPYTHON"""
+ if isinstance(other, Utf8Str):
+ return self.bytes != other.bytes
+ if isinstance(other, unicode):
+ return unicode(self.bytes, 'utf8') != other
+
+ return True
+
+ def __lt__(self, other):
+ return self.bytes < other.bytes
+
+ def __le__(self, other):
+ return self.bytes <= other.bytes
+
+ def __gt__(self, other):
+ return self.bytes > other.bytes
+
+ def __ge__(self, other):
+ return self.bytes >= other.bytes
+
@specialize.argtype(1)
def __contains__(self, other):
if isinstance(other, Utf8Str):
@@ -158,11 +191,20 @@
def __iter__(self):
return self.char_iter()
+ def __unicode__(self):
+ return unicode(self.bytes, 'utf8')
+
def char_iter(self):
- return Utf8StrCharIterator(self)
+ return Utf8CharacterIter(self)
+
+ def reverse_char_iter(self):
+ return Utf8ReverseCharacterIter(self)
def codepoint_iter(self):
- return Utf8StrCodePointIterator(self)
+ return Utf8CodePointIter(self)
+
+ def reverse_codepoint_iter(self):
+ return Utf8ReverseCodePointIter(self)
@specialize.argtype(1, 2)
def _bound_check(self, start, end):
@@ -270,12 +312,11 @@
else:
break
- iter.prev_count(1)
start_byte = iter.byte_pos
- iter.next_count(1)
if maxsplit == 0:
- res.append(Utf8Str(self.bytes[start_byte:len(self.bytes)]))
+ res.append(Utf8Str(self.bytes[start_byte:len(self.bytes)],
+ self._is_ascii))
break
for cd in iter:
@@ -283,12 +324,12 @@
break
else:
# Hit the end of the string
- res.append(Utf8Str(self.bytes[start_byte:len(self.bytes)]))
+ res.append(Utf8Str(self.bytes[start_byte:len(self.bytes)],
+ self._is_ascii))
break
- iter.prev_count(1)
- res.append(Utf8Str(self.bytes[start_byte:iter.byte_pos]))
- iter.next_count(1)
+ res.append(Utf8Str(self.bytes[start_byte:iter.byte_pos],
+ self._is_ascii))
maxsplit -= 1
return res
@@ -302,15 +343,54 @@
other_bytes = other.bytes
return [Utf8Str(s) for s in self.bytes.rsplit(other_bytes, maxsplit)]
- # TODO: I need to make a reverse_codepoint_iter first
+ res = []
+ iter = self.reverse_codepoint_iter()
+ while True:
+ # Find the start of the next word
+ for cd in iter:
+ if not unicodedb.isspace(cd):
+ break
+ else:
+ break
+ start_byte = self.next_char(iter.byte_pos)
+
+ if maxsplit == 0:
+ res.append(Utf8Str(self.bytes[0:start_byte], self._is_ascii))
+ break
+
+ # Find the end of the word
+ for cd in iter:
+ if unicodedb.isspace(cd):
+ break
+ else:
+ # We hit the end of the string
+ res.append(Utf8Str(self.bytes[0:start_byte], self._is_ascii))
+ break
+
+ end_byte = self.next_char(iter.byte_pos)
+ res.append(Utf8Str(self.bytes[end_byte:start_byte],
+ self._is_ascii))
+ maxsplit -= 1
+
+ res.reverse()
+ return res
+
+ @specialize.argtype(1)
def join(self, other):
if len(other) == 0:
return Utf8Str('')
- assert isinstance(other[0], Utf8Str)
- return Utf8Str(self.bytes.join([s.bytes for s in other]),
- self._is_ascii and all(s._is_ascii for s in other))
+ if isinstance(other[0], Utf8Str):
+ return Utf8Str(
+ self.bytes.join([s.bytes for s in other]),
+ self._is_ascii and all(s._is_ascii for s in other)
+ )
+ else:
+ return Utf8Str(
+ self.bytes.join([s for s in other]),
+ self._is_ascii and all(s._is_ascii for s in other)
+ )
def as_unicode(self):
"""NOT_RPYTHON"""
@@ -321,83 +401,18 @@
"""NOT_RPYTHON"""
return Utf8Str(u.encode('utf-8'))
-class Utf8StrCodePointIterator(object):
- def __init__(self, ustr):
- self.ustr = ustr
- self.pos = 0
- self.byte_pos = 0
+ def next_char(self, byte_pos):
+ return byte_pos + utf8_code_length[ord(self.bytes[byte_pos])]
- if len(ustr) != 0:
- self.current = utf8ord_bytes(ustr.bytes, 0)
- else:
- self.current = -1
+ def prev_char(self, byte_pos):
+ if byte_pos == 0:
+ return -1
+ byte_pos -= 1
+ while utf8_code_length[ord(self.bytes[byte_pos])] == 0:
+ byte_pos -= 1
+ return byte_pos
- def __iter__(self):
- return self
- def next(self):
- if self.pos == len(self.ustr):
- raise StopIteration()
- self.current = utf8ord_bytes(self.ustr.bytes, self.byte_pos)
-
- self.byte_pos += utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
- self.pos += 1
-
- return self.current
-
- def next_count(self, count=1):
- self.pos += count
- while count > 1:
- self.byte_pos += utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
- count -= 1
- self.current = utf8ord_bytes(self.ustr.bytes, self.byte_pos)
- self.byte_pos += utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
-
- def prev_count(self, count=1):
- self.pos -= count
- while count > 0:
- self.byte_pos -= 1
- while utf8_code_length[ord(self.ustr.bytes[self.byte_pos])] == 0:
- self.byte_pos -= 1
- count -= 1
-
- self.current = utf8ord_bytes(self.ustr.bytes, self.byte_pos)
-
- def move(self, count):
- if count > 0:
- self.next_count(count)
- elif count < 0:
- self.prev_count(-count)
-
- def peek_next(self):
- return utf8ord_bytes(self.ustr.bytes, self.byte_pos)
-
-class Utf8StrCharIterator(object):
- def __init__(self, ustr):
- self.ustr = ustr
- self.byte_pos = 0
- self.current = self._get_current()
-
- def __iter__(self):
- return self
-
- def _get_current(self):
- if self.byte_pos == len(self.ustr.bytes):
- return None
- length = utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
- return Utf8Str(''.join([self.ustr.bytes[i]
- for i in range(self.byte_pos, self.byte_pos + length)]),
- length == 1)
-
- def next(self):
- #import pdb; pdb.set_trace()
- ret = self.current
- if ret is None:
- raise StopIteration()
-
- self.byte_pos += utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
- self.current = self._get_current()
- return ret
class Utf8Builder(object):
@specialize.argtype(1)
@@ -452,9 +467,168 @@
raise TypeError("Invalid type '%s' for Utf8Str.append_slice" %
type(s))
+ @specialize.argtype(1)
def append_multiple_char(self, c, count):
- self._builder.append_multiple_char(c, count)
+ # TODO: What do I do when I have an int? Is it fine to just loop over
+ # .append(c) then? Should (can) I force a resize first?
+ if isinstance(c, int):
+ self._builder.append_multiple_char(chr(c), count)
+ return
+
+ if len(c) > 1:
+ import pdb; pdb.set_trace()
+ if isinstance(c, str):
+ self._builder.append_multiple_char(c, count)
+ else:
+ self._builder.append_multiple_char(c.bytes, count)
def build(self):
return Utf8Str(self._builder.build(), self._is_ascii)
+# _______________________________________________
+
+# iter.current is the current (ie the last returned) element
+# iter.pos is the position of the current element
+# iter.byte_pos is the byte position of the current element
+# In the before-the-start state, for forward iterators iter.pos and
+# iter.byte_pos are -1. For reverse iterators, they are len(ustr) and
+# len(ustr.bytes) respectively.
+
+class ForwardIterBase(object):
+ def __init__(self, ustr):
+ self.ustr = ustr
+ self.pos = -1
+
+ self._byte_pos = 0
+ self.byte_pos = -1
+ self.current = self._default
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ if self.pos + 1 == len(self.ustr):
+ raise StopIteration()
+
+ self.pos += 1
+ self.byte_pos = self._byte_pos
+
+ self.current = self._value(self.byte_pos)
+
+ self._byte_pos = self.ustr.next_char(self._byte_pos)
+ return self.current
+
+ def peek_next(self):
+ return self._value(self._byte_pos)
+
+ def peek_prev(self):
+ return self._value(self._move_backward(self.byte_pos))
+
+ def move(self, count):
+ if count > 0:
+ self.pos += count
+
+ while count != 1:
+ self._byte_pos = self.ustr.next_char(self._byte_pos)
+ count -= 1
+ self.byte_pos = self._byte_pos
+ self._byte_pos = self.ustr.next_char(self._byte_pos)
+ self.current = self._value(self.byte_pos)
+
+ elif count < 0:
+ self.pos += count
+ while count < -1:
+ self.byte_pos = self.ustr.prev_char(self.byte_pos)
+ count += 1
+ self._byte_pos = self.byte_pos
+ self.byte_pos = self.ustr.prev_char(self.byte_pos)
+ self.current = self._value(self.byte_pos)
+
+ def copy(self):
+ iter = self.__class__(self.ustr)
+ iter.pos = self.pos
+ iter.byte_pos = self.byte_pos
+ iter._byte_pos = self._byte_pos
+ iter.current = self.current
+ return iter
+
+class ReverseIterBase(object):
+ def __init__(self, ustr):
+ self.ustr = ustr
+ self.pos = len(ustr)
+ self.byte_pos = len(ustr.bytes)
+ self.current = self._default
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ if self.pos == 0:
+ raise StopIteration()
+
+ self.pos -= 1
+ self.byte_pos = self.ustr.prev_char(self.byte_pos)
+ self.current = self._value(self.byte_pos)
+ return self.current
+
+ def peek_next(self):
+ return self._value(self.ustr.prev_char(self.byte_pos))
+
+ def peek_prev(self):
+ return self._value(self.ustr.next_char(self.byte_pos))
+
+ def move(self, count):
+ if count > 0:
+ self.pos -= count
+ while count != 0:
+ self.byte_pos = self.ustr.prev_char(self.byte_pos)
+ count -= 1
+ self.current = self._value(self.byte_pos)
+ elif count < 0:
+ self.pos -= count
+ while count != 0:
+ self.byte_pos = self.ustr.next_char(self.byte_pos)
+ count += 1
+ self.current = self._value(self.byte_pos)
+
+ def copy(self):
+ iter = self.__class__(self.ustr)
+ iter.pos = self.pos
+ iter.byte_pos = self.byte_pos
+ iter.current = self.current
+ return iter
+
+def make_iterator(name, base, calc_value, default):
+ class C(base):
+ _default = default
+ _value = calc_value
+ C.__name__ = name
+ return C
+
+def codepoint_calc_value(self, byte_pos):
+ if byte_pos == -1 or byte_pos == len(self.ustr.bytes):
+ return -1
+ return utf8ord_bytes(self.ustr.bytes, byte_pos)
+
+def character_calc_value(self, byte_pos):
+ if byte_pos == -1 or byte_pos == len(self.ustr.bytes):
+ return None
+ length = utf8_code_length[ord(self.ustr.bytes[self.byte_pos])]
+ return Utf8Str(''.join([self.ustr.bytes[i]
+ for i in range(self.byte_pos, self.byte_pos + length)]),
+ length == 1)
+
+Utf8CodePointIter = make_iterator("Utf8CodePointIter", ForwardIterBase,
+ codepoint_calc_value, -1)
+Utf8CharacterIter = make_iterator("Utf8CharacterIter", ForwardIterBase,
+ character_calc_value, None)
+Utf8ReverseCodePointIter = make_iterator(
+ "Utf8ReverseCodePointIter", ReverseIterBase, codepoint_calc_value, -1)
+Utf8ReverseCharacterIter = make_iterator(
+ "Utf8ReverseCharacterIter", ReverseIterBase, character_calc_value, None)
+
+del make_iterator
+del codepoint_calc_value
+del character_calc_value
+del ForwardIterBase
+del ReverseIterBase
diff --git a/pypy/interpreter/utf8_codecs.py b/pypy/interpreter/utf8_codecs.py
--- a/pypy/interpreter/utf8_codecs.py
+++ b/pypy/interpreter/utf8_codecs.py
@@ -208,7 +208,6 @@
pos = 0
while pos < size:
- #oc = ORD(s, pos)
oc = utf8ord(s, pos)
# Escape quotes
@@ -460,10 +459,10 @@
else:
return s.bytes
- iter.move(-1)
result = Utf8Builder(len(s.bytes))
result.append_slice(s.bytes, 0, iter.byte_pos)
+ iter.move(-1)
for oc in iter:
if oc >= 0xD800 and oc <= 0xDFFF:
# Check the next character to see if this is a surrogate pair
@@ -741,7 +740,6 @@
result = Utf8Builder(size // 2)
- #XXX I think the errors are not correctly handled here
while pos < size:
# remaining bytes at the end? (size should be even)
if len(s) - pos < 2:
@@ -869,7 +867,8 @@
def str_decode_utf_32_helper(s, size, errors, final=True,
errorhandler=None,
- byteorder="native"):
+ byteorder="native",
+ encodingname='utf32'):
if errorhandler is None:
errorhandler = default_unicode_error_decode
bo = 0
@@ -924,7 +923,7 @@
if len(s) - pos < 4:
if not final:
break
- r, pos = errorhandler(errors, 'utf32', "truncated data",
+ r, pos = errorhandler(errors, encodingname, "truncated data",
s, pos, len(s))
result.append(r)
if len(s) - pos < 4:
@@ -933,7 +932,8 @@
ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 16) |
(ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]]))
if ch >= 0x110000:
- r, pos = errorhandler(errors, 'utf32', "codepoint not in range(0x110000)",
+ r, pos = errorhandler(errors, encodingname,
+ "codepoint not in range(0x110000)",
s, pos, len(s))
result.append(r)
continue
@@ -1097,7 +1097,7 @@
if errorhandler is None:
errorhandler = default_unicode_error_decode
if size == 0:
- return u'', 0
+ return Utf8Str(''), 0
inShift = False
base64bits = 0
@@ -1345,9 +1345,12 @@
def str_decode_unicode_internal(s, size, errors, final=False,
errorhandler=None):
if BYTEORDER == 'little':
- return str_decode_utf_32_le(s, size, errors, errorhandler)
+ result, length, byteorder = str_decode_utf_32_helper(
+ s, size, errors, final, errorhandler, "little", "unicode_internal")
else:
- return str_decode_utf_32_be(s, size, errors, errorhandler)
+ result, length, byteorder = str_decode_utf_32_helper(
+ s, size, errors, final, errorhandler, "internal", "unicode_internal")
+ return result, length
def unicode_encode_unicode_internal(s, size, errors, errorhandler=None):
if BYTEORDER == 'little':
@@ -1561,6 +1564,7 @@
def default_unicode_error_decode(errors, encoding, msg, s,
startingpos, endingpos):
+ """NOT_RPYTHON"""
if errors == 'replace':
return _unicode_error_replacement, endingpos
if errors == 'ignore':
@@ -1570,9 +1574,10 @@
def default_unicode_error_encode(errors, encoding, msg, u,
startingpos, endingpos):
+ """NOT_RPYTHON"""
if errors == 'replace':
return '?', None, endingpos
if errors == 'ignore':
return '', None, endingpos
- raise UnicodeEncodeError(encoding, u, startingpos, endingpos, msg)
+ raise UnicodeEncodeError(encoding, unicode(u), startingpos, endingpos, msg)
diff --git a/pypy/objspace/std/bytearrayobject.py b/pypy/objspace/std/bytearrayobject.py
--- a/pypy/objspace/std/bytearrayobject.py
+++ b/pypy/objspace/std/bytearrayobject.py
@@ -9,6 +9,7 @@
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.gateway import WrappedDefault, interp2app, unwrap_spec
from pypy.interpreter.signature import Signature
+from pypy.interpreter.utf8_codecs import str_decode_latin_1
from pypy.objspace.std.sliceobject import W_SliceObject
from pypy.objspace.std.stdtypedef import StdTypeDef
from pypy.objspace.std.stringmethods import StringMethods, _get_buffer
@@ -154,9 +155,11 @@
w_dict = self.getdict(space)
if w_dict is None:
w_dict = space.w_None
+ ustr = str_decode_latin_1(''.join(self.data), len(self.data),
+ 'strict')[0]
return space.newtuple([
space.type(self), space.newtuple([
- space.wrap(''.join(self.data).decode('latin-1')),
+ space.wrap(ustr),
space.wrap('latin-1')]),
w_dict])
diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py
--- a/pypy/objspace/std/newformat.py
+++ b/pypy/objspace/std/newformat.py
@@ -4,7 +4,9 @@
import string
from pypy.interpreter.error import OperationError, oefmt
-from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, ORD
+from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, ORD, utf8chr
+from pypy.interpreter.utf8_codecs import (
+ unicode_encode_latin_1, unicode_encode_ascii, str_decode_ascii)
from rpython.rlib import rstring, runicode, rlocale, rfloat, jit
from rpython.rlib.objectmodel import specialize
from rpython.rlib.rfloat import copysign, formatd
@@ -20,7 +22,7 @@
result = 0
i = start
while i < end:
- digit = ord(s[i]) - ord('0')
+ digit = ORD(s, i) - ord('0')
if 0 <= digit <= 9:
if result > (sys.maxint - digit) / 10:
raise oefmt(space.w_ValueError,
@@ -63,22 +65,24 @@
out = Utf8Builder()
else:
out = rstring.StringBuilder()
+
if not level:
raise OperationError(space.w_ValueError,
space.wrap("Recursion depth exceeded"))
level -= 1
- s = self.template
- return self._do_build_string(start, end, level, out, s)
+ return self._do_build_string(start, end, level, out, self.template)
@jit.look_inside_iff(lambda self, start, end, level, out, s: jit.isconstant(s))
def _do_build_string(self, start, end, level, out, s):
space = self.space
last_literal = i = start
+
while i < end:
c = ORD(s, i)
i += 1
if c == ord("{") or c == ord("}"):
at_end = i == end
+
# Find escaped "{" and "}"
markup_follows = True
if c == ord("}"):
@@ -87,6 +91,7 @@
space.wrap("Single '}'"))
i += 1
markup_follows = False
+
if c == ord("{"):
if at_end:
raise OperationError(space.w_ValueError,
@@ -94,6 +99,7 @@
if ORD(s, i) == ord("{"):
i += 1
markup_follows = False
+
# Attach literal data, ending with { or }
out.append_slice(s, last_literal, i - 1)
if not markup_follows:
@@ -101,6 +107,7 @@
end_literal = i - 1
assert end_literal > last_literal
literal = self.template[last_literal:end_literal]
+
w_entry = space.newtuple([
space.wrap(literal),
space.w_None, space.w_None, space.w_None])
@@ -108,6 +115,7 @@
self.last_end = i
last_literal = i
continue
+
nested = 1
field_start = i
recursive = False
@@ -121,6 +129,7 @@
if not nested:
break
i += 1
+
if nested:
raise OperationError(space.w_ValueError,
space.wrap("Unmatched '{'"))
@@ -139,41 +148,43 @@
# Find ":" or "!"
i = start
while i < end:
- c = s[i]
+ c = ORD(s, i)
if c == ord(":") or c == ord("!"):
end_name = i
+
if c == ord("!"):
i += 1
if i == end:
w_msg = self.space.wrap("expected conversion")
raise OperationError(self.space.w_ValueError, w_msg)
- conversion = s[i]
+
+ conversion = ORD(s, i)
i += 1
if i < end:
- if s[i] != ':':
+ if ORD(s, i) != ord(':'):
w_msg = self.space.wrap("expected ':' after"
" format specifier")
raise OperationError(self.space.w_ValueError,
w_msg)
i += 1
else:
- conversion = None
+ conversion = -1
i += 1
return s[start:end_name], conversion, i
i += 1
- return s[start:end], None, end
+ return s[start:end], -1, end
@jit.unroll_safe
def _get_argument(self, name):
# First, find the argument.
space = self.space
i = 0
- end = len(name)
- while i < end:
- c = name[i]
+ while i < len(name):
+ c = ORD(name, i)
if c == ord("[") or c == ord("."):
break
i += 1
+
empty = not i
if empty:
index = -1
@@ -181,12 +192,14 @@
index, stop = _parse_int(self.space, name, 0, i)
if stop != i:
index = -1
+
use_numeric = empty or index != -1
if self.auto_numbering_state == ANS_INIT and use_numeric:
if empty:
self.auto_numbering_state = ANS_AUTO
else:
self.auto_numbering_state = ANS_MANUAL
+
if use_numeric:
if self.auto_numbering_state == ANS_MANUAL:
if empty:
@@ -204,7 +217,8 @@
kwarg = name[:i]
if self.is_unicode:
try:
- arg_key = kwarg.encode("latin-1")
+ arg_key = unicode_encode_latin_1(kwarg, len(kwarg),
+ 'strict')
except UnicodeEncodeError:
# Not going to be found in a dict of strings.
raise OperationError(space.w_KeyError, space.wrap(kwarg))
@@ -220,7 +234,7 @@
except IndexError:
w_msg = space.wrap("index out of range")
raise OperationError(space.w_IndexError, w_msg)
- return self._resolve_lookups(w_arg, name, i, end)
+ return self._resolve_lookups(w_arg, name, i, len(name))
@jit.unroll_safe
def _resolve_lookups(self, w_obj, name, start, end):
@@ -228,15 +242,16 @@
space = self.space
i = start
while i < end:
- c = name[i]
+ c = ORD(name, i)
if c == ord("."):
i += 1
start = i
while i < end:
- c = name[i]
+ c = ORD(name, i)
if c == ord("[") or c == ord("."):
break
i += 1
+
if start == i:
w_msg = space.wrap("Empty attribute in format string")
raise OperationError(space.w_ValueError, w_msg)
@@ -247,18 +262,17 @@
self.parser_list_w.append(space.newtuple([
space.w_True, w_attr]))
elif c == ord("["):
- got_bracket = False
i += 1
start = i
while i < end:
- c = name[i]
+ c = ORD(name, i)
if c == ord("]"):
- got_bracket = True
break
i += 1
- if not got_bracket:
+ else:
raise OperationError(space.w_ValueError,
space.wrap("Missing ']'"))
+
index, reached = _parse_int(self.space, name, start, i)
if index != -1 and reached == i:
w_item = space.wrap(index)
@@ -285,29 +299,30 @@
if c == ord("[") or c == ord("."):
break
i += 1
+
if i == 0:
index = -1
else:
index, stop = _parse_int(self.space, name, 0, i)
if stop != i:
index = -1
+
if index >= 0:
w_first = space.wrap(index)
else:
w_first = space.wrap(name[:i])
- #
+
self.parser_list_w = []
self._resolve_lookups(None, name, i, end)
- #
+
return space.newtuple([w_first,
space.iter(space.newlist(self.parser_list_w))])
def _convert(self, w_obj, conversion):
space = self.space
- conv = ORD(conversion, 0)
- if conv == ord("r"):
+ if conversion == ord("r"):
return space.repr(w_obj)
- elif conv == ord("s"):
+ elif conversion == ord("s"):
if self.is_unicode:
return space.call_function(space.w_unicode, w_obj)
return space.str(w_obj)
@@ -318,7 +333,7 @@
def _render_field(self, start, end, recursive, level):
name, conversion, spec_start = self._parse_field(start, end)
spec = self.template[spec_start:end]
- #
+
if self.parser_list_w is not None:
# used from formatter_parser()
if level == 1: # ignore recursive calls
@@ -333,12 +348,13 @@
self.parser_list_w.append(w_entry)
self.last_end = end + 1
return self.empty
- #
+
w_obj = self._get_argument(name)
- if conversion is not None:
+ if conversion != -1:
w_obj = self._convert(w_obj, conversion)
if recursive:
spec = self._build_string(spec_start, end, level)
+
w_rendered = self.space.format(w_obj, self.space.wrap(spec))
unwrapper = "unicode_w" if self.is_unicode else "str_w"
to_interp = getattr(self.space, unwrapper)
@@ -348,7 +364,7 @@
self.parser_list_w = []
self.last_end = 0
self._build_string(0, len(self.template), 2)
- #
+
space = self.space
if self.last_end < len(self.template):
w_lastentry = space.newtuple([
@@ -413,7 +429,7 @@
def __init__(self, space, is_unicode, spec):
self.space = space
self.is_unicode = is_unicode
- self.empty = u"" if is_unicode else ""
+ self.empty = Utf8Str("") if is_unicode else ""
self.spec = spec
def _is_alignment(self, c):
@@ -429,78 +445,76 @@
def _parse_spec(self, default_type, default_align):
space = self.space
- self._fill_char = self._lit("\0")[0]
- self._align = default_align
+ self._fill_char = ord("\0")
+
+ self._align = ord(default_align)
self._alternate = False
- self._sign = "\0"
+ self._sign = ord("\0")
self._thousands_sep = False
self._precision = -1
- the_type = default_type
+
spec = self.spec
if not spec:
return True
+
length = len(spec)
i = 0
got_align = True
- if length - i >= 2 and self._is_alignment(spec[i + 1]):
- self._align = spec[i + 1]
- self._fill_char = spec[i]
+
+ if length - i >= 2 and self._is_alignment(ORD(spec, i + 1)):
+ self._align = ORD(spec, i + 1)
+ self._fill_char = ORD(spec, i)
i += 2
- elif length - i >= 1 and self._is_alignment(spec[i]):
- self._align = spec[i]
+ elif length - i >= 1 and self._is_alignment(ORD(spec, i)):
+ self._align = ORD(spec, i)
i += 1
else:
got_align = False
- if length - i >= 1 and self._is_sign(spec[i]):
- self._sign = spec[i]
+
+ if length - i >= 1 and self._is_sign(ORD(spec, i)):
+ self._sign = ORD(spec, i)
i += 1
- if length - i >= 1 and spec[i] == "#":
+ if length - i >= 1 and ORD(spec, i) == ord("#"):
self._alternate = True
i += 1
- if self._fill_char == "\0" and length - i >= 1 and spec[i] == "0":
- self._fill_char = self._lit("0")[0]
+
+ if (self._fill_char == ord("\0") and length - i >= 1 and
+ ORD(spec, i) == ord("0")):
+ self._fill_char = ord("0")
if not got_align:
- self._align = "="
+ self._align = ord("=")
i += 1
+
self._width, i = _parse_int(self.space, spec, i, length)
- if length != i and spec[i] == ",":
+ if length != i and ORD(spec, i) == ord(","):
self._thousands_sep = True
i += 1
- if length != i and spec[i] == ".":
+ if length != i and ORD(spec, i) == ord("."):
i += 1
self._precision, i = _parse_int(self.space, spec, i, length)
if self._precision == -1:
raise OperationError(space.w_ValueError,
space.wrap("no precision given"))
+
if length - i > 1:
raise OperationError(space.w_ValueError,
space.wrap("invalid format spec"))
if length - i == 1:
- presentation_type = spec[i]
if self.is_unicode:
try:
- the_type = spec[i].encode("ascii")[0]
+ self._type = unicode_encode_ascii(spec[i], 1, 'strict')[0]
except UnicodeEncodeError:
raise OperationError(space.w_ValueError,
space.wrap("invalid presentation type"))
else:
- the_type = presentation_type
+ self._type = spec[i]
i += 1
- self._type = the_type
+ else:
+ self._type = default_type
+
if self._thousands_sep:
- tp = self._type
- if (tp == "d" or
- tp == "e" or
- tp == "f" or
- tp == "g" or
- tp == "E" or
- tp == "G" or
- tp == "%" or
- tp == "F" or
- tp == "\0"):
- # ok
- pass
- else:
+ if self._type not in ('d', 'e', 'f', 'g', 'E', 'G', '%', 'F',
+ '\0'):
raise OperationError(space.w_ValueError,
space.wrap("invalid type with ','"))
return False
@@ -511,12 +525,13 @@
total = self._width
else:
total = length
+
align = self._align
- if align == ">":
+ if align == ord(">"):
left = total - length
- elif align == "^":
+ elif align == ord("^"):
left = (total - length) / 2
- elif align == "<" or align == "=":
+ elif align == ord("<") or align == ord("="):
left = 0
else:
raise AssertionError("shouldn't be here")
@@ -525,22 +540,16 @@
self._right_pad = right
return total
- def _lit(self, s):
- if self.is_unicode:
- return s.decode("ascii")
- else:
- return s
-
def _pad(self, string):
builder = self._builder()
- builder.append_multiple_char(self._fill_char, self._left_pad)
+ builder.append_multiple_char(chr(self._fill_char), self._left_pad)
builder.append(string)
- builder.append_multiple_char(self._fill_char, self._right_pad)
+ builder.append_multiple_char(chr(self._fill_char), self._right_pad)
return builder.build()
def _builder(self):
if self.is_unicode:
- return rstring.UnicodeBuilder()
+ return Utf8Builder()
else:
return rstring.StringBuilder()
@@ -555,23 +564,25 @@
return space.wrap(string)
if self._type != "s":
self._unknown_presentation("string")
- if self._sign != "\0":
+ if self._sign != ord("\0"):
msg = "Sign not allowed in string format specifier"
raise OperationError(space.w_ValueError, space.wrap(msg))
if self._alternate:
msg = "Alternate form not allowed in string format specifier"
raise OperationError(space.w_ValueError, space.wrap(msg))
- if self._align == "=":
+ if self._align == ord("="):
msg = "'=' alignment not allowed in string format specifier"
raise OperationError(space.w_ValueError, space.wrap(msg))
+
length = len(string)
precision = self._precision
if precision != -1 and length >= precision:
assert precision >= 0
length = precision
string = string[:precision]
- if self._fill_char == "\0":
- self._fill_char = self._lit(" ")[0]
+
+ if self._fill_char == ord("\0"):
+ self._fill_char = ord(" ")
self._calc_padding(string, length)
return space.wrap(self._pad(string))
@@ -586,9 +597,11 @@
dec = "."
thousands = ""
grouping = "\256"
+
if self.is_unicode:
- self._loc_dec = dec.decode("ascii")
- self._loc_thousands = thousands.decode("ascii")
+ self._loc_dec = str_decode_ascii(dec, len(dec), 'strict')[0]
+ self._loc_thousands = str_decode_ascii(
+ thousands, len(thousands), 'strict')[0]
else:
self._loc_dec = dec
self._loc_thousands = thousands
@@ -617,41 +630,45 @@
spec.n_rpadding = 0
spec.n_min_width = 0
spec.n_total = 0
- spec.sign = "\0"
+ spec.sign = ord("\0")
spec.n_sign = 0
+
sign = self._sign
- if sign == "+":
+ if sign == ord("+"):
spec.n_sign = 1
- spec.sign = "-" if sign_char == "-" else "+"
- elif sign == " ":
+ spec.sign = ord("-") if sign_char == "-" else ord("+")
+ elif sign == ord(" "):
spec.n_sign = 1
- spec.sign = "-" if sign_char == "-" else " "
+ spec.sign = ord("-") if sign_char == "-" else ord(" ")
elif sign_char == "-":
spec.n_sign = 1
- spec.sign = "-"
+ spec.sign = ord("-")
extra_length = (spec.n_sign + spec.n_prefix + spec.n_decimal +
spec.n_remainder) # Not padding or digits
- if self._fill_char == "0" and self._align == "=":
+
+ if self._fill_char == ord("0") and self._align == ord("="):
spec.n_min_width = self._width - extra_length
if self._loc_thousands:
self._group_digits(spec, digits[to_number:])
n_grouped_digits = len(self._grouped_digits)
else:
n_grouped_digits = spec.n_digits
+
n_padding = self._width - (extra_length + n_grouped_digits)
if n_padding > 0:
align = self._align
- if align == "<":
+ if align == ord("<"):
spec.n_rpadding = n_padding
- elif align == ">":
+ elif align == ord(">"):
spec.n_lpadding = n_padding
- elif align == "^":
+ elif align == ord("^"):
spec.n_lpadding = n_padding // 2
spec.n_rpadding = n_padding - spec.n_lpadding
- elif align == "=":
+ elif align == ord("="):
spec.n_spadding = n_padding
else:
raise AssertionError("shouldn't reach")
+
spec.n_total = spec.n_lpadding + spec.n_sign + spec.n_prefix + \
spec.n_spadding + n_grouped_digits + \
spec.n_decimal + spec.n_remainder + spec.n_rpadding
@@ -720,21 +737,28 @@
def _fill_number(self, spec, num, to_digits, to_prefix, fill_char,
to_remainder, upper, grouped_digits=None):
out = self._builder()
+
if spec.n_lpadding:
- out.append_multiple_char(fill_char[0], spec.n_lpadding)
+ out.append_multiple_char(chr(fill_char), spec.n_lpadding)
+
if spec.n_sign:
if self.is_unicode:
- sign = spec.sign.decode("ascii")
+ # TODO: A better way to do this might be to check if
+ # spec.sign < 127 ...
+ sign = str_decode_ascii(chr(spec.sign), 1, 'strict')[0]
else:
- sign = spec.sign
+ sign = chr(spec.sign)
out.append(sign)
+
if spec.n_prefix:
pref = num[to_prefix:to_prefix + spec.n_prefix]
if upper:
pref = self._upcase_string(pref)
out.append(pref)
+
if spec.n_spadding:
- out.append_multiple_char(fill_char[0], spec.n_spadding)
+ out.append_multiple_char(chr(fill_char), spec.n_spadding)
+
if spec.n_digits != 0:
if self._loc_thousands:
if grouped_digits is not None:
@@ -749,12 +773,13 @@
if upper:
digits = self._upcase_string(digits)
out.append(digits)
+
if spec.n_decimal:
- out.append(self._lit(".")[0])
+ out.append(".")
if spec.n_remainder:
out.append(num[to_remainder:])
if spec.n_rpadding:
- out.append_multiple_char(fill_char[0], spec.n_rpadding)
+ out.append_multiple_char(chr(fill_char), spec.n_rpadding)
#if complex, need to call twice - just retun the buffer
return out.build()
@@ -764,14 +789,14 @@
msg = "precision not allowed in integer type"
raise OperationError(space.w_ValueError, space.wrap(msg))
sign_char = "\0"
- tp = self._type
- if tp == "c":
- if self._sign != "\0":
+
+ if self._type == "c":
+ if self._sign != ord("\0"):
msg = "sign not allowed with 'c' presentation type"
raise OperationError(space.w_ValueError, space.wrap(msg))
value = space.int_w(w_num)
if self.is_unicode:
- result = runicode.UNICHR(value)
+ result = utf8chr(value)
else:
result = chr(value)
n_digits = 1
@@ -781,16 +806,16 @@
to_prefix = 0
to_numeric = 0
else:
- if tp == "b":
+ if self._type == "b":
base = 2
skip_leading = 2
- elif tp == "o":
+ elif self._type == "o":
base = 8
skip_leading = 2
- elif tp == "x" or tp == "X":
+ elif self._type == "x" or self._type == "X":
base = 16
skip_leading = 2
- elif tp == "n" or tp == "d":
+ elif self._type == "n" or self._type == "d":
base = 10
skip_leading = 0
else:
@@ -801,7 +826,7 @@
result = self._long_to_base(base, space.bigint_w(w_num))
n_prefix = skip_leading if self._alternate else 0
to_prefix = 0
- if result[0] == "-":
+ if ORD(result, 0) == ord("-"):
sign_char = "-"
skip_leading += 1
to_prefix += 1
@@ -809,10 +834,10 @@
n_remainder = 0
to_remainder = 0
to_numeric = skip_leading
- self._get_locale(tp)
+ self._get_locale(self._type)
spec = self._calc_num_width(n_prefix, sign_char, to_numeric, n_digits,
n_remainder, False, result)
- fill = self._lit(" ") if self._fill_char == "\0" else self._fill_char
+ fill = ord(" ") if self._fill_char == ord("\0") else self._fill_char
upper = self._type == "X"
return self.space.wrap(self._fill_number(spec, result, to_numeric,
to_prefix, fill, to_remainder, upper))
@@ -827,14 +852,14 @@
prefix = "0x"
as_str = value.format(LONG_DIGITS[:base], prefix)
if self.is_unicode:
- return as_str.decode("ascii")
+ return str_decode_ascii(as_str, len(as_str), 'strict')[0]
return as_str
def _int_to_base(self, base, value):
if base == 10:
s = str(value)
if self.is_unicode:
- return s.decode("ascii")
+ return str_decode_ascii(s, len(s), 'strict')[0]
return s
# This part is slow.
negative = value < 0
@@ -879,22 +904,10 @@
if self.is_unicode:
return space.call_function(space.w_unicode, w_num)
return self.space.str(w_num)
- tp = self._type
- if (tp == "b" or
- tp == "c" or
- tp == "d" or
- tp == "o" or
- tp == "x" or
- tp == "X" or
- tp == "n"):
+
+ if self._type in ("b", "c", "d", "o", "x", "X", "n"):
return self._format_int_or_long(w_num, kind)
- elif (tp == "e" or
- tp == "E" or
- tp == "f" or
- tp == "F" or
- tp == "g" or
- tp == "G" or
- tp == "%"):
+ elif self._type in ("e", "E", "f", "F", "g", "G", "%"):
w_float = space.float(w_num)
return self._format_float(w_float)
else:
@@ -921,6 +934,7 @@
if self._alternate:
msg = "alternate form not allowed in float formats"
raise OperationError(space.w_ValueError, space.wrap(msg))
+
tp = self._type
self._get_locale(tp)
if tp == "\0":
@@ -929,6 +943,7 @@
flags |= rfloat.DTSF_ADD_DOT_0
elif tp == "n":
tp = "g"
+
value = space.float_w(w_float)
if tp == "%":
tp = "f"
@@ -936,6 +951,7 @@
add_pct = True
else:
add_pct = False
+
if self._precision == -1:
self._precision = default_precision
result, special = rfloat.double_to_string(value, tp,
@@ -943,22 +959,26 @@
if add_pct:
result += "%"
n_digits = len(result)
- if result[0] == "-":
+
+ if ORD(result, 0) == ord("-"):
sign = "-"
to_number = 1
n_digits -= 1
else:
sign = "\0"
to_number = 0
+
have_dec_point, to_remainder = self._parse_number(result, to_number)
n_remainder = len(result) - to_remainder
+
if self.is_unicode:
- digits = result.decode("ascii")
+ digits = str_decode_ascii(result , len(result), 'strict')[0]
else:
digits = result
+
spec = self._calc_num_width(0, sign, to_number, n_digits,
n_remainder, have_dec_point, digits)
- fill = self._lit(" ") if self._fill_char == "\0" else self._fill_char
+ fill = ord(" ") if self._fill_char == ord("\0") else self._fill_char
return self.space.wrap(self._fill_number(spec, digits, to_number, 0,
fill, to_remainder, False))
@@ -968,30 +988,23 @@
if self.is_unicode:
return space.call_function(space.w_unicode, w_float)
return space.str(w_float)
- tp = self._type
- if (tp == "\0" or
- tp == "e" or
- tp == "E" or
- tp == "f" or
- tp == "F" or
- tp == "g" or
- tp == "G" or
- tp == "n" or
- tp == "%"):
+
+ if self._type in ("\0", "e", "E", "f", "F", "g", "G", "n", "%"):
return self._format_float(w_float)
self._unknown_presentation("float")
def _format_complex(self, w_complex):
space = self.space
+
tp = self._type
self._get_locale(tp)
default_precision = 6
- if self._align == "=":
+ if self._align == ord("="):
# '=' alignment is invalid
msg = ("'=' alignment flag is not allowed in"
" complex format specifier")
raise OperationError(space.w_ValueError, space.wrap(msg))
- if self._fill_char == "0":
+ if self._fill_char == ord("0"):
#zero padding is invalid
msg = "Zero padding is not allowed in complex format specifier"
raise OperationError(space.w_ValueError, space.wrap(msg))
@@ -1047,7 +1060,7 @@
tmp_fill_char = self._fill_char
tmp_align = self._align
tmp_width = self._width
- self._fill_char = "\0"
+ self._fill_char = ord("\0")
self._align = "<"
self._width = -1
@@ -1058,8 +1071,8 @@
to_imag_number)
if self.is_unicode:
- re_num = re_num.decode("ascii")
- im_num = im_num.decode("ascii")
+ re_num = str_decode_ascii(re_num, len(re_num), 'strict')[0]
+ im_num = str_decode_ascii(im_num, len(im_num), 'strict')[0]
#set remainder, in CPython _parse_number sets this
#using n_re_digits causes tests to fail
@@ -1073,7 +1086,7 @@
#self._grouped_digits will get overwritten in imaginary calc_num_width
re_grouped_digits = self._grouped_digits
if not skip_re:
- self._sign = "+"
+ self._sign = ord("+")
im_spec = self._calc_num_width(0, im_sign, to_imag_number, n_im_digits,
im_n_remainder, im_have_dec,
im_num)
@@ -1093,14 +1106,14 @@
out = self._builder()
fill = self._fill_char
- if fill == "\0":
- fill = self._lit(" ")[0]
+ if fill == ord("\0"):
+ fill = ord(" ")
#compose the string
#add left padding
- out.append_multiple_char(fill, self._left_pad)
+ out.append_multiple_char(chr(fill), self._left_pad)
if add_parens:
- out.append(self._lit('(')[0])
+ out.append('(')
#if the no. has a real component, add it
if not skip_re:
@@ -1114,13 +1127,13 @@
im_grouped_digits))
#add 'j' character
- out.append(self._lit('j')[0])
+ out.append('j')
if add_parens:
- out.append(self._lit(')')[0])
+ out.append(')')
#add right padding
- out.append_multiple_char(fill, self._right_pad)
+ out.append_multiple_char(chr(fill), self._right_pad)
return self.space.wrap(out.build())
@@ -1131,15 +1144,8 @@
#parse format specification, set associated variables
if self._parse_spec("\0", ">"):
return space.str(w_complex)
- tp = self._type
- if (tp == "\0" or
- tp == "e" or
- tp == "E" or
- tp == "f" or
- tp == "F" or
- tp == "g" or
- tp == "G" or
- tp == "n"):
+
+ if self._type in ('\0', 'e', 'E', 'f', 'F', 'g', 'G', 'n'):
return self._format_complex(w_complex)
self._unknown_presentation("complex")
return Formatter
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -162,9 +162,6 @@
if isinstance(x, Utf8Str):
return wrapunicode(self, x)
- if isinstance(x, unicode):
- import pdb; pdb.set_trace()
-
if isinstance(x, float):
return W_FloatObject(x)
if isinstance(x, W_Root):
diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py
--- a/pypy/objspace/std/stringmethods.py
+++ b/pypy/objspace/std/stringmethods.py
@@ -585,13 +585,13 @@
by = self._op_val(space, w_sep)
if len(by) == 0:
raise oefmt(space.w_ValueError, "empty separator")
- res = self._split(value, by, maxsplit)
+ res = self._rsplit(value, by, maxsplit)
return self._newlist_unwrapped(space, res)
@staticmethod
def _rsplit(value, sep=None, maxsplit=-1):
- return value.split(sep, maxsplit)
+ return rsplit(value, sep, maxsplit)
@unwrap_spec(keepends=bool)
def descr_splitlines(self, space, keepends=False):
@@ -606,7 +606,8 @@
eol = pos
pos += 1
# read CRLF as one line break
- if pos < length and value[eol] == '\r' and value[pos] == '\n':
+ if (pos < length and ORD(value, eol) == ord('\r') and
+ ORD(value, pos) == ord('\n')):
pos += 1
if keepends:
eol = pos
diff --git a/pypy/objspace/std/test/test_dictmultiobject.py b/pypy/objspace/std/test/test_dictmultiobject.py
--- a/pypy/objspace/std/test/test_dictmultiobject.py
+++ b/pypy/objspace/std/test/test_dictmultiobject.py
@@ -3,6 +3,7 @@
from pypy.objspace.std.dictmultiobject import (W_DictMultiObject,
BytesDictStrategy, ObjectDictStrategy)
+from pypy.interpreter.utf8 import Utf8Str
class TestW_DictObject(object):
@@ -142,8 +143,9 @@
def test_listview_unicode_dict(self):
w = self.space.wrap
+ w_u = lambda x: w(Utf8Str.from_unicode(x))
w_d = self.space.newdict()
- w_d.initialize_content([(w(u"a"), w(1)), (w(u"b"), w(2))])
+ w_d.initialize_content([(w_u(u"a"), w(1)), (w_u(u"b"), w(2))])
assert self.space.listview_unicode(w_d) == [u"a", u"b"]
def test_listview_int_dict(self):
@@ -154,7 +156,8 @@
def test_keys_on_string_unicode_int_dict(self, monkeypatch):
w = self.space.wrap
-
+ w_u = lambda x: w(Utf8Str.from_unicode(x))
+
w_d = self.space.newdict()
w_d.initialize_content([(w(1), w("a")), (w(2), w("b"))])
w_l = self.space.call_method(w_d, "keys")
@@ -174,7 +177,7 @@
# but we need space.newlist_unicode for it
monkeypatch.undo()
w_d = self.space.newdict()
- w_d.initialize_content([(w(u"a"), w(1)), (w(u"b"), w(6))])
+ w_d.initialize_content([(w_u(u"a"), w(1)), (w_u(u"b"), w(6))])
w_l = self.space.call_method(w_d, "keys")
assert sorted(self.space.listview_unicode(w_l)) == [u"a", u"b"]
diff --git a/pypy/objspace/std/test/test_index.py b/pypy/objspace/std/test/test_index.py
--- a/pypy/objspace/std/test/test_index.py
+++ b/pypy/objspace/std/test/test_index.py
@@ -1,5 +1,7 @@
from py.test import raises
+from pypy.interpreter.utf8 import Utf8Str
+
class AppTest_IndexProtocol:
def setup_class(self):
w_oldstyle = self.space.appexec([], """():
@@ -263,7 +265,7 @@
class AppTest_UnicodeTestCase(SeqTestCase, StringTestCase):
def setup_method(self, method):
SeqTestCase.setup_method(self, method)
- self.w_seq = self.space.wrap(u"this is a test")
+ self.w_seq = self.space.wrap(Utf8Str.from_unicode(u"this is a test"))
self.w_const = self.space.appexec([], """(): return unicode""")
diff --git a/pypy/objspace/std/test/test_lengthhint.py b/pypy/objspace/std/test/test_lengthhint.py
--- a/pypy/objspace/std/test/test_lengthhint.py
+++ b/pypy/objspace/std/test/test_lengthhint.py
@@ -1,3 +1,4 @@
+from pypy.interpreter.utf8 import Utf8Str
from pypy.module._collections.interp_deque import W_Deque
from pypy.module.itertools.interp_itertools import W_Repeat
@@ -71,7 +72,7 @@
self._test_length_hint(self.space.wrap('P' * self.SIZE))
def test_unicode(self):
- self._test_length_hint(self.space.wrap(u'Y' * self.SIZE))
+ self._test_length_hint(self.space.wrap(Utf8Str('Y' * self.SIZE)))
def test_tuple(self):
self._test_length_hint(self.space.wrap(tuple(self.ITEMS)))
diff --git a/pypy/objspace/std/test/test_liststrategies.py b/pypy/objspace/std/test/test_liststrategies.py
--- a/pypy/objspace/std/test/test_liststrategies.py
+++ b/pypy/objspace/std/test/test_liststrategies.py
@@ -1,4 +1,5 @@
import sys
+from pypy.interpreter.utf8 import Utf8Str
from pypy.objspace.std.listobject import (
W_ListObject, EmptyListStrategy, ObjectListStrategy, IntegerListStrategy,
FloatListStrategy, BytesListStrategy, RangeListStrategy,
@@ -11,20 +12,22 @@
def test_check_strategy(self):
space = self.space
w = space.wrap
+ w_u = lambda x: w(Utf8Str(x))
assert isinstance(W_ListObject(space, []).strategy, EmptyListStrategy)
assert isinstance(W_ListObject(space, [w(1),w('a')]).strategy, ObjectListStrategy)
assert isinstance(W_ListObject(space, [w(1),w(2),w(3)]).strategy,
IntegerListStrategy)
assert isinstance(W_ListObject(space, [w('a'), w('b')]).strategy,
BytesListStrategy)
- assert isinstance(W_ListObject(space, [w(u'a'), w(u'b')]).strategy,
+ assert isinstance(W_ListObject(space, [w_u('a'), w_u('b')]).strategy,
UnicodeListStrategy)
- assert isinstance(W_ListObject(space, [w(u'a'), w('b')]).strategy,
+ assert isinstance(W_ListObject(space, [w_u('a'), w('b')]).strategy,
ObjectListStrategy) # mixed unicode and bytes
def test_empty_to_any(self):
space = self.space
w = space.wrap
+ w_u = lambda x: w(Utf8Str(x))
l = W_ListObject(space, [])
assert isinstance(l.strategy, EmptyListStrategy)
l.append(w((1,3)))
@@ -42,7 +45,7 @@
l = W_ListObject(space, [])
assert isinstance(l.strategy, EmptyListStrategy)
- l.append(w(u'a'))
+ l.append(w_u('a'))
assert isinstance(l.strategy, UnicodeListStrategy)
l = W_ListObject(space, [])
@@ -70,9 +73,10 @@
def test_unicode_to_any(self):
space = self.space
- l = W_ListObject(space, [space.wrap(u'a'), space.wrap(u'b'), space.wrap(u'c')])
+ w_u = lambda x: space.wrap(Utf8Str(x))
+ l = W_ListObject(space, [w_u('a'), w_u('b'), w_u('c')])
assert isinstance(l.strategy, UnicodeListStrategy)
- l.append(space.wrap(u'd'))
+ l.append(w_u('d'))
assert isinstance(l.strategy, UnicodeListStrategy)
l.append(space.wrap(3))
assert isinstance(l.strategy, ObjectListStrategy)
@@ -89,6 +93,7 @@
def test_setitem(self):
space = self.space
w = space.wrap
+ w_u = lambda x: w(Utf8Str(x))
# This should work if test_listobject.py passes
l = W_ListObject(space, [w('a'),w('b'),w('c')])
assert space.eq_w(l.getitem(0), w('a'))
@@ -110,7 +115,7 @@
assert isinstance(l.strategy, ObjectListStrategy)
# UnicodeStrategy to ObjectStrategy
- l = W_ListObject(space, [w(u'a'),w(u'b'),w(u'c')])
+ l = W_ListObject(space, [w_u('a'),w_u('b'),w_u('c')])
assert isinstance(l.strategy, UnicodeListStrategy)
l.setitem(0, w(2))
assert isinstance(l.strategy, ObjectListStrategy)
@@ -124,6 +129,7 @@
def test_insert(self):
space = self.space
w = space.wrap
+ w_u = lambda x: w(Utf8Str(x))
# no change
l = W_ListObject(space, [w(1),w(2),w(3)])
assert isinstance(l.strategy, IntegerListStrategy)
@@ -137,7 +143,7 @@
assert isinstance(l.strategy, ObjectListStrategy)
# UnicodeStrategy
- l = W_ListObject(space, [w(u'a'),w(u'b'),w(u'c')])
+ l = W_ListObject(space, [w_u('a'),w_u('b'),w_u('c')])
assert isinstance(l.strategy, UnicodeListStrategy)
l.insert(3, w(2))
assert isinstance(l.strategy, ObjectListStrategy)
@@ -186,6 +192,7 @@
def test_setslice(self):
space = self.space
w = space.wrap
+ w_u = lambda x: w(Utf8Str(x))
l = W_ListObject(space, [])
assert isinstance(l.strategy, EmptyListStrategy)
@@ -217,7 +224,7 @@
assert isinstance(l.strategy, ObjectListStrategy)
# UnicodeStrategy to ObjectStrategy
- l = W_ListObject(space, [w(u'a'), w(u'b'), w(u'c')])
+ l = W_ListObject(space, [w_u('a'), w_u('b'), w_u('c')])
assert isinstance(l.strategy, UnicodeListStrategy)
l.setslice(0, 1, 2, W_ListObject(space, [w(1), w(2), w(3)]))
assert isinstance(l.strategy, ObjectListStrategy)
@@ -242,6 +249,8 @@
def wrapitems(items):
items_w = []
for i in items:
+ if isinstance(i, unicode):
+ i = Utf8Str.from_unicode(i)
items_w.append(space.wrap(i))
return items_w
@@ -323,6 +332,7 @@
def test_empty_extend_with_any(self):
space = self.space
w = space.wrap
+ w_u = lambda x: w(Utf8Str(x))
empty = W_ListObject(space, [])
assert isinstance(empty.strategy, EmptyListStrategy)
@@ -336,7 +346,7 @@
empty = W_ListObject(space, [])
assert isinstance(empty.strategy, EmptyListStrategy)
- empty.extend(W_ListObject(space, [w(u"a"), w(u"b"), w(u"c")]))
+ empty.extend(W_ListObject(space, [w_u("a"), w_u("b"), w_u("c")]))
assert isinstance(empty.strategy, UnicodeListStrategy)
empty = W_ListObject(space, [])
@@ -588,11 +598,12 @@
assert self.space.eq_w(l3, W_ListObject(self.space, [self.space.wrap(1), self.space.wrap(2), self.space.wrap(3), self.space.wrap(4), self.space.wrap(5)]))
def test_unicode(self):
+ wrap_u = lambda x: self.space.wrap(Utf8Str(x))
l1 = W_ListObject(self.space, [self.space.wrap("eins"), self.space.wrap("zwei")])
assert isinstance(l1.strategy, BytesListStrategy)
- l2 = W_ListObject(self.space, [self.space.wrap(u"eins"), self.space.wrap(u"zwei")])
+ l2 = W_ListObject(self.space, [wrap_u("eins"), wrap_u("zwei")])
assert isinstance(l2.strategy, UnicodeListStrategy)
- l3 = W_ListObject(self.space, [self.space.wrap("eins"), self.space.wrap(u"zwei")])
+ l3 = W_ListObject(self.space, [self.space.wrap("eins"), wrap_u(u"zwei")])
assert isinstance(l3.strategy, ObjectListStrategy)
def test_listview_bytes(self):
@@ -603,20 +614,22 @@
def test_listview_unicode(self):
space = self.space
+ wrap_u = lambda x: self.space.wrap(Utf8Str(x))
assert space.listview_unicode(space.wrap(1)) == None
- w_l = self.space.newlist([self.space.wrap(u'a'), self.space.wrap(u'b')])
+ w_l = self.space.newlist([wrap_u('a'), wrap_u('b')])
assert space.listview_unicode(w_l) == [u"a", u"b"]
def test_string_join_uses_listview_bytes(self):
space = self.space
+ wrap_u = lambda x: self.space.wrap(Utf8Str(x))
w_l = self.space.newlist([self.space.wrap('a'), self.space.wrap('b')])
w_l.getitems = None
assert space.str_w(space.call_method(space.wrap("c"), "join", w_l)) == "acb"
#
# the same for unicode
- w_l = self.space.newlist([self.space.wrap(u'a'), self.space.wrap(u'b')])
+ w_l = self.space.newlist([wrap_u('a'), wrap_u('b')])
w_l.getitems = None
- assert space.unicode_w(space.call_method(space.wrap(u"c"), "join", w_l)) == u"acb"
+ assert space.unicode_w(space.call_method(wrap_u("c"), "join", w_l)) == u"acb"
def test_string_join_returns_same_instance(self):
space = self.space
@@ -626,10 +639,11 @@
assert space.is_w(space.call_method(space.wrap(" -- "), "join", w_l), w_text)
#
# the same for unicode
- w_text = space.wrap(u"text")
+ w_base = space.wrap(Utf8Str(" -- "))
+ w_text = space.wrap(Utf8Str("text"))
w_l = self.space.newlist([w_text])
w_l.getitems = None
- assert space.is_w(space.call_method(space.wrap(u" -- "), "join", w_l), w_text)
+ assert space.is_w(space.call_method(w_base, "join", w_l), w_text)
def test_newlist_bytes(self):
space = self.space
@@ -656,7 +670,7 @@
def test_unicode_uses_newlist_unicode(self):
space = self.space
- w_u = space.wrap(u"a b c")
+ w_u = space.wrap(Utf8Str("a b c"))
space.newlist = None
try:
w_l = space.call_method(w_u, "split")
@@ -711,7 +725,8 @@
def test_listview_unicode_list(self):
space = self.space
- w_l = W_ListObject(space, [space.wrap(u"a"), space.wrap(u"b")])
+ wrap_u = lambda x: self.space.wrap(Utf8Str(x))
+ w_l = W_ListObject(space, [wrap_u("a"), wrap_u("b")])
assert self.space.listview_unicode(w_l) == [u"a", u"b"]
def test_listview_int_list(self):
diff --git a/pypy/objspace/std/test/test_newformat.py b/pypy/objspace/std/test/test_newformat.py
--- a/pypy/objspace/std/test/test_newformat.py
+++ b/pypy/objspace/std/test/test_newformat.py
@@ -382,30 +382,30 @@
assert l == [('abcd', None, None, None)]
#
l = list('ab{0}cd'._formatter_parser())
- assert l == [('ab', '0', '', None), ('cd', None, None, None)]
+ assert l == [('ab', '0', '', -1), ('cd', None, None, None)]
#
l = list('{0}cd'._formatter_parser())
- assert l == [('', '0', '', None), ('cd', None, None, None)]
+ assert l == [('', '0', '', -1), ('cd', None, None, None)]
#
l = list('ab{0}'._formatter_parser())
- assert l == [('ab', '0', '', None)]
+ assert l == [('ab', '0', '', -1)]
#
l = list(''._formatter_parser())
assert l == []
#
l = list('{0:123}'._formatter_parser())
- assert l == [('', '0', '123', None)]
+ assert l == [('', '0', '123', -1)]
#
l = list('{0!x:123}'._formatter_parser())
- assert l == [('', '0', '123', 'x')]
+ assert l == [('', '0', '123', ord('x'))]
#
l = list('{0!x:12{sdd}3}'._formatter_parser())
- assert l == [('', '0', '12{sdd}3', 'x')]
+ assert l == [('', '0', '12{sdd}3', ord('x'))]
def test_u_formatter_parser(self):
l = list(u'{0!x:12{sdd}3}'._formatter_parser())
- assert l == [(u'', u'0', u'12{sdd}3', u'x')]
- for x in l[0]:
+ assert l == [(u'', u'0', u'12{sdd}3', ord(u'x'))]
+ for x in l[0][:-1]:
assert isinstance(x, unicode)
def test_formatter_parser_escape(self):
More information about the pypy-commit
mailing list