[pypy-commit] pypy unicode-utf8: merge heads
arigo
pypy.commits at gmail.com
Mon Nov 27 16:17:18 EST 2017
Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r93186:350cb9b5b92b
Date: 2017-11-27 22:16 +0100
http://bitbucket.org/pypy/pypy/changeset/350cb9b5b92b/
Log: merge heads
diff too long, truncating to 2000 out of 2094 lines
diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -9,3 +9,5 @@
* remove assertions from W_UnicodeObject.__init__ if all the builders pass
* what to do with error handlers that go backwards. There were tests
in test_codecs that would check for that
+
+* fix _pypyjson to not use a wrapped dict when decoding an object
diff --git a/extra_tests/test_textio.py b/extra_tests/test_textio.py
new file mode 100644
--- /dev/null
+++ b/extra_tests/test_textio.py
@@ -0,0 +1,28 @@
+from hypothesis import given, strategies as st
+
+from io import BytesIO, TextIOWrapper
+
+LINESEP = ['', '\r', '\n', '\r\n']
+
+@st.composite
+def text_with_newlines(draw):
+ sep = draw(st.sampled_from(LINESEP))
+ lines = draw(st.lists(st.text(max_size=10), max_size=10))
+ return sep.join(lines)
+
+@given(txt=text_with_newlines(),
+ mode=st.sampled_from(['\r', '\n', '\r\n', '']),
+ limit=st.integers(min_value=-1))
+def test_readline(txt, mode, limit):
+ textio = TextIOWrapper(
+ BytesIO(txt.encode('utf-8')), encoding='utf-8', newline=mode)
+ lines = []
+ while True:
+ line = textio.readline(limit)
+ if limit > 0:
+ assert len(line) < limit
+ if line:
+ lines.append(line)
+ else:
+ break
+ assert u''.join(lines) == txt
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1760,10 +1760,6 @@
def utf8_w(self, w_obj):
return w_obj.utf8_w(self)
- def unicode_w(self, w_obj):
- # XXX: kill me!
- return w_obj.utf8_w(self).decode('utf-8')
-
def convert_to_w_unicode(self, w_obj):
return w_obj.convert_to_w_unicode(self)
diff --git a/pypy/module/_continuation/test/conftest.py b/pypy/module/_continuation/test/conftest.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/conftest.py
@@ -0,0 +1,7 @@
+import pytest
+import sys
+
+def pytest_configure(config):
+ if sys.platform.startswith('linux'):
+ from rpython.rlib.rvmprof.cintf import configure_libbacktrace_linux
+ configure_libbacktrace_linux()
diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -2,21 +2,115 @@
from pypy.interpreter.typedef import (
TypeDef, generic_new_descr, GetSetProperty)
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
-from pypy.module._io.interp_textio import W_TextIOBase, W_IncrementalNewlineDecoder
+from pypy.module._io.interp_textio import (
+ W_TextIOBase, W_IncrementalNewlineDecoder)
from pypy.module._io.interp_iobase import convert_size
+class UnicodeIO(object):
+ def __init__(self, data=None, pos=0):
+ if data is None:
+ data = []
+ self.data = data
+ self.pos = pos
+
+ def resize(self, newlength):
+ if len(self.data) > newlength:
+ self.data = self.data[:newlength]
+ if len(self.data) < newlength:
+ self.data.extend([u'\0'] * (newlength - len(self.data)))
+
+ def read(self, size):
+ start = self.pos
+ available = len(self.data) - start
+ if available <= 0:
+ return u''
+ if size >= 0 and size <= available:
+ end = start + size
+ else:
+ end = len(self.data)
+ assert 0 <= start <= end
+ self.pos = end
+ return u''.join(self.data[start:end])
+
+ def _convert_limit(self, limit):
+ if limit < 0 or limit > len(self.data) - self.pos:
+ limit = len(self.data) - self.pos
+ assert limit >= 0
+ return limit
+
+ def readline_universal(self, limit):
+ # Universal newline search. Find any of \r, \r\n, \n
+ limit = self._convert_limit(limit)
+ start = self.pos
+ end = start + limit
+ pos = start
+ while pos < end:
+ ch = self.data[pos]
+ pos += 1
+ if ch == '\n':
+ break
+ if ch == '\r':
+ if pos >= end:
+ break
+ if self.data[pos] == '\n':
+ pos += 1
+ break
+ else:
+ break
+ self.pos = pos
+ result = u''.join(self.data[start:pos])
+ return result
+
+ def readline(self, marker, limit):
+ start = self.pos
+ limit = self._convert_limit(limit)
+ end = start + limit
+ found = False
+ for pos in range(start, end - len(marker) + 1):
+ ch = self.data[pos]
+ if ch == marker[0]:
+ for j in range(1, len(marker)):
+ if self.data[pos + j] != marker[j]:
+ break # from inner loop
+ else:
+ pos += len(marker)
+ found = True
+ break
+ if not found:
+ pos = end
+ self.pos = pos
+ result = u''.join(self.data[start:pos])
+ return result
+
+ def write(self, string):
+ length = len(string)
+ if self.pos + length > len(self.data):
+ self.resize(self.pos + length)
+
+ for i in range(length):
+ self.data[self.pos + i] = string[i]
+ self.pos += length
+
+ def seek(self, pos):
+ self.pos = pos
+
+ def truncate(self, size):
+ if size < len(self.data):
+ self.resize(size)
+
+ def getvalue(self):
+ return u''.join(self.data)
+
class W_StringIO(W_TextIOBase):
def __init__(self, space):
W_TextIOBase.__init__(self, space)
- self.buf = []
- self.pos = 0
+ self.buf = UnicodeIO()
- @unwrap_spec(w_newline = WrappedDefault("\n"))
+ @unwrap_spec(w_newline=WrappedDefault("\n"))
def descr_init(self, space, w_initvalue=None, w_newline=None):
# In case __init__ is called multiple times
- self.buf = []
- self.pos = 0
+ self.buf = UnicodeIO()
self.w_decoder = None
self.readnl = None
self.writenl = None
@@ -27,7 +121,7 @@
newline = space.unicode_w(w_newline)
if (newline is not None and newline != u"" and newline != u"\n" and
- newline != u"\r" and newline != u"\r\n"):
+ newline != u"\r" and newline != u"\r\n"):
# Not using oefmt() because I don't know how to use it
# with unicode
raise OperationError(space.w_ValueError,
@@ -50,7 +144,7 @@
if not space.is_none(w_initvalue):
self.write_w(space, w_initvalue)
- self.pos = 0
+ self.buf.pos = 0
def descr_getstate(self, space):
w_initialval = self.getvalue_w(space)
@@ -58,9 +152,9 @@
if self.readnl is None:
w_readnl = space.w_None
else:
- w_readnl = space.str(space.newunicode(self.readnl)) # YYY
+ w_readnl = space.str(space.newunicode(self.readnl)) # YYY
return space.newtuple([
- w_initialval, w_readnl, space.newint(self.pos), w_dict
+ w_initialval, w_readnl, space.newint(self.buf.pos), w_dict
])
def descr_setstate(self, space, w_state):
@@ -69,34 +163,33 @@
# We allow the state tuple to be longer than 4, because we may need
# someday to extend the object's state without breaking
# backwards-compatibility
- if not space.isinstance_w(w_state, space.w_tuple) or space.len_w(w_state) < 4:
+ if (not space.isinstance_w(w_state, space.w_tuple)
+ or space.len_w(w_state) < 4):
raise oefmt(space.w_TypeError,
"%T.__setstate__ argument should be a 4-tuple, got %T",
self, w_state)
w_initval, w_readnl, w_pos, w_dict = space.unpackiterable(w_state, 4)
+ if not space.isinstance_w(w_initval, space.w_unicode):
+ raise oefmt(space.w_TypeError,
+ "unicode argument expected, got '%T'", w_initval)
# Initialize state
- self.descr_init(space, w_initval, w_readnl)
+ self.descr_init(space, None, w_readnl)
- # Restore the buffer state. Even if __init__ did initialize the buffer,
- # we have to initialize it again since __init__ may translates the
- # newlines in the inital_value string. We clearly do not want that
+ # Restore the buffer state. We're not doing it via __init__
# because the string value in the state tuple has already been
# translated once by __init__. So we do not take any chance and replace
# object's buffer completely
initval = space.unicode_w(w_initval)
- size = len(initval)
- self.resize_buffer(size)
- self.buf = list(initval)
pos = space.getindex_w(w_pos, space.w_TypeError)
if pos < 0:
raise oefmt(space.w_ValueError,
"position value cannot be negative")
- self.pos = pos
+ self.buf = UnicodeIO(list(initval), pos)
if not space.is_w(w_dict, space.w_None):
if not space.isinstance_w(w_dict, space.w_dict):
- raise oefmt(space.w_TypeError,
- "fourth item of state should be a dict, got a %T",
- w_dict)
+ raise oefmt(
+ space.w_TypeError,
+ "fourth item of state should be a dict, got a %T", w_dict)
# Alternatively, we could replace the internal dictionary
# completely. However, it seems more practical to just update it.
space.call_method(self.w_dict, "update", w_dict)
@@ -107,88 +200,47 @@
message = "I/O operation on closed file"
raise OperationError(space.w_ValueError, space.newtext(message))
- def resize_buffer(self, newlength):
- if len(self.buf) > newlength:
- self.buf = self.buf[:newlength]
- if len(self.buf) < newlength:
- self.buf.extend([u'\0'] * (newlength - len(self.buf)))
-
- def write(self, string):
- length = len(string)
- if self.pos + length > len(self.buf):
- self.resize_buffer(self.pos + length)
-
- for i in range(length):
- self.buf[self.pos + i] = string[i]
- self.pos += length
-
def write_w(self, space, w_obj):
if not space.isinstance_w(w_obj, space.w_unicode):
raise oefmt(space.w_TypeError,
"unicode argument expected, got '%T'", w_obj)
self._check_closed(space)
-
orig_size = space.len_w(w_obj)
if self.w_decoder is not None:
w_decoded = space.call_method(
- self.w_decoder, "decode", w_obj, space.w_True
- )
+ self.w_decoder, "decode", w_obj, space.w_True)
else:
w_decoded = w_obj
-
if self.writenl:
w_decoded = space.call_method(
- w_decoded, "replace", space.newtext("\n"), space.newunicode(self.writenl)
- )
+ w_decoded, "replace",
+ space.newtext("\n"), space.newunicode(self.writenl))
+ string = space.unicode_w(w_decoded)
+ if string:
+ self.buf.write(string)
- string = space.unicode_w(w_decoded)
- size = len(string)
-
- if size:
- self.write(string)
return space.newint(orig_size)
def read_w(self, space, w_size=None):
self._check_closed(space)
size = convert_size(space, w_size)
- start = self.pos
- available = len(self.buf) - start
- if available <= 0:
- return space.newunicode(u"")
- if size >= 0 and size <= available:
- end = start + size
- else:
- end = len(self.buf)
- assert 0 <= start <= end
- self.pos = end
- return space.newunicode(u''.join(self.buf[start:end]))
+ return space.newunicode(self.buf.read(size))
def readline_w(self, space, w_limit=None):
self._check_closed(space)
limit = convert_size(space, w_limit)
+ if self.readuniversal:
+ result = self.buf.readline_universal(limit)
+ else:
+ if self.readtranslate:
+ # Newlines are already translated, only search for \n
+ newline = u'\n'
+ else:
+ newline = self.readnl
+ result = self.buf.readline(newline, limit)
+ return space.newunicode(result)
- if self.pos >= len(self.buf):
- return space.newunicode(u"")
-
- start = self.pos
- if limit < 0 or limit > len(self.buf) - self.pos:
- limit = len(self.buf) - self.pos
-
- assert limit >= 0
- end = start + limit
-
- endpos, consumed = self._find_line_ending(
- # XXX: super inefficient, makes a copy of the entire contents.
- u"".join(self.buf),
- start,
- end
- )
- if endpos < 0:
- endpos = end
- assert endpos >= 0
- self.pos = endpos
- return space.newunicode(u"".join(self.buf[start:endpos]))
@unwrap_spec(pos=int, mode=int)
def seek_w(self, space, pos, mode=0):
@@ -204,32 +256,27 @@
# XXX: this makes almost no sense, but its how CPython does it.
if mode == 1:
- pos = self.pos
+ pos = self.buf.pos
elif mode == 2:
- pos = len(self.buf)
-
+ pos = len(self.buf.data)
assert pos >= 0
- self.pos = pos
+ self.buf.seek(pos)
return space.newint(pos)
def truncate_w(self, space, w_size=None):
self._check_closed(space)
if space.is_none(w_size):
- size = self.pos
+ size = self.buf.pos
else:
size = space.int_w(w_size)
-
if size < 0:
raise oefmt(space.w_ValueError, "Negative size value %d", size)
-
- if size < len(self.buf):
- self.resize_buffer(size)
-
+ self.buf.truncate(size)
return space.newint(size)
def getvalue_w(self, space):
self._check_closed(space)
- return space.newunicode(u''.join(self.buf))
+ return space.newunicode(self.buf.getvalue())
def readable_w(self, space):
self._check_closed(space)
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -221,46 +221,6 @@
def newlines_get_w(self, space):
return space.w_None
- def _find_line_ending(self, line, start, end):
- size = end - start
- if self.readtranslate:
- # Newlines are already translated, only search for \n
- pos = line.find('\n', start, end)
- if pos >= 0:
- return pos + 1, 0
- else:
- return -1, size
- elif self.readuniversal:
- # Universal newline search. Find any of \r, \r\n, \n
- # The decoder ensures that \r\n are not split in two pieces
- i = start
- while True:
- # Fast path for non-control chars.
- while i < end and line[i] > '\r':
- i += 1
- if i >= end:
- return -1, size
- ch = line[i]
- i += 1
- if ch == '\n':
- return i, 0
- if ch == '\r':
- if line[i] == '\n':
- return i + 1, 0
- else:
- return i, 0
- else:
- # Non-universal mode.
- pos = line.find(self.readnl, start, end)
- if pos >= 0:
- return pos + len(self.readnl), 0
- else:
- pos = line.find(self.readnl[0], start, end)
- if pos >= 0:
- return -1, pos - start
- return -1, size
-
-
W_TextIOBase.typedef = TypeDef(
'_io._TextIOBase', W_IOBase.typedef,
__new__ = generic_new_descr(W_TextIOBase),
@@ -336,6 +296,126 @@
self.input = input
+class DecodeBuffer(object):
+ def __init__(self, text=None):
+ self.text = text
+ self.pos = 0
+
+ def set(self, space, w_decoded):
+ check_decoded(space, w_decoded)
+ self.text = space.unicode_w(w_decoded)
+ self.pos = 0
+
+ def reset(self):
+ self.text = None
+ self.pos = 0
+
+ def get_chars(self, size):
+ if self.text is None:
+ return u""
+
+ available = len(self.text) - self.pos
+ if size < 0 or size > available:
+ size = available
+ assert size >= 0
+
+ if self.pos > 0 or size < available:
+ start = self.pos
+ end = self.pos + size
+ assert start >= 0
+ assert end >= 0
+ chars = self.text[start:end]
+ else:
+ chars = self.text
+
+ self.pos += size
+ return chars
+
+ def has_data(self):
+ return (self.text is not None and not self.exhausted())
+
+ def exhausted(self):
+ return self.pos >= len(self.text)
+
+ def next_char(self):
+ if self.exhausted():
+ raise StopIteration
+ ch = self.text[self.pos]
+ self.pos += 1
+ return ch
+
+ def peek_char(self):
+ # like next_char, but doesn't advance pos
+ if self.exhausted():
+ raise StopIteration
+ ch = self.text[self.pos]
+ return ch
+
+ def find_newline_universal(self, limit):
+ # Universal newline search. Find any of \r, \r\n, \n
+ # The decoder ensures that \r\n are not split in two pieces
+ if limit < 0:
+ limit = sys.maxint
+ scanned = 0
+ while scanned < limit:
+ try:
+ ch = self.next_char()
+ except StopIteration:
+ return False
+ if ch == u'\n':
+ return True
+ if ch == u'\r':
+ if scanned >= limit:
+ return False
+ try:
+ ch = self.peek_char()
+ except StopIteration:
+ return False
+ if ch == u'\n':
+ self.next_char()
+ return True
+ else:
+ return True
+ return False
+
+ def find_crlf(self, limit):
+ if limit < 0:
+ limit = sys.maxint
+ scanned = 0
+ while scanned < limit:
+ try:
+ ch = self.next_char()
+ except StopIteration:
+ return False
+ scanned += 1
+ if ch == u'\r':
+ if scanned >= limit:
+ return False
+ try:
+ if self.peek_char() == u'\n':
+ self.next_char()
+ return True
+ except StopIteration:
+ # This is the tricky case: we found a \r right at the end
+ self.pos -= 1
+ return False
+ return False
+
+ def find_char(self, marker, limit):
+ if limit < 0:
+ limit = sys.maxint
+ scanned = 0
+ while scanned < limit:
+ try:
+ ch = self.next_char()
+ except StopIteration:
+ return False
+ if ch == marker:
+ return True
+ scanned += 1
+ return False
+
+
def check_decoded(space, w_decoded):
if not space.isinstance_w(w_decoded, space.w_unicode):
msg = "decoder should return a string result, not '%T'"
@@ -349,8 +429,7 @@
self.w_encoder = None
self.w_decoder = None
- self.decoded_chars = None # buffer for text returned from decoder
- self.decoded_chars_used = 0 # offset into _decoded_chars for read()
+ self.decoded = DecodeBuffer()
self.pending_bytes = None # list of bytes objects waiting to be
# written, or NULL
self.chunk_size = 8192
@@ -518,40 +597,10 @@
# _____________________________________________________________
# read methods
- def _unset_decoded(self):
- self.decoded_chars = None
- self.decoded_chars_used = 0
-
- def _set_decoded(self, space, w_decoded):
- check_decoded(space, w_decoded)
- self.decoded_chars = space.utf8_w(w_decoded)
- self.decoded_chars_used = 0
-
- def _get_decoded_chars(self, size):
- if self.decoded_chars is None:
- return ""
-
- available = len(self.decoded_chars) - self.decoded_chars_used
- if size < 0 or size > available:
- size = available
- assert size >= 0
-
- if self.decoded_chars_used > 0 or size < available:
- start = self.decoded_chars_used
- end = self.decoded_chars_used + size
- assert start >= 0
- assert end >= 0
- chars = self.decoded_chars[start:end]
- else:
- chars = self.decoded_chars
-
- self.decoded_chars_used += size
- return chars
-
def _read_chunk(self, space):
"""Read and decode the next chunk of data from the BufferedReader.
The return value is True unless EOF was reached. The decoded string
- is placed in self._decoded_chars (replacing its previous value).
+ is placed in self.decoded (replacing its previous value).
The entire input chunk is sent to the decoder, though some of it may
remain buffered in the decoder, yet to be converted."""
@@ -571,7 +620,7 @@
dec_buffer = None
dec_flags = 0
- # Read a chunk, decode it, and put the result in self._decoded_chars
+ # Read a chunk, decode it, and put the result in self.decoded
w_input = space.call_method(self.w_buffer, "read1",
space.newint(self.chunk_size))
@@ -583,7 +632,7 @@
eof = space.len_w(w_input) == 0
w_decoded = space.call_method(self.w_decoder, "decode",
w_input, space.newbool(eof))
- self._set_decoded(space, w_decoded)
+ self.decoded.set(space, w_decoded)
if space.len_w(w_decoded) > 0:
eof = False
@@ -595,6 +644,19 @@
return not eof
+ def _ensure_data(self, space):
+ while not self.decoded.has_data():
+ try:
+ if not self._read_chunk(space):
+ self.decoded.reset()
+ self.snapshot = None
+ return False
+ except OperationError as e:
+ if trap_eintr(space, e):
+ continue
+ raise
+ return True
+
def next_w(self, space):
self._check_attached(space)
self.telling = False
@@ -619,7 +681,7 @@
w_bytes = space.call_method(self.w_buffer, "read")
w_decoded = space.call_method(self.w_decoder, "decode", w_bytes, space.w_True)
check_decoded(space, w_decoded)
- w_result = space.new_from_utf8(self._get_decoded_chars(-1))
+ w_result = space.new_from_utf8(self.decoded.get_chars(-1))
w_final = space.add(w_result, w_decoded)
self.snapshot = None
return w_final
@@ -628,24 +690,29 @@
builder = StringBuilder(size)
# Keep reading chunks until we have n characters to return
- while True:
- data = self._get_decoded_chars(remaining)
+ while remaining > 0:
+ if not self._ensure_data(space):
+ break
+ data = self.decoded.get_chars(remaining)
builder.append(data)
remaining -= len(data)
- if remaining <= 0: # Done
- break
+ return space.new_from_utf8(builder.build())
- try:
- if not self._read_chunk(space):
- # EOF
- break
- except OperationError as e:
- if trap_eintr(space, e):
- continue
- raise
-
- return space.new_from_utf8(builder.build())
+ def _scan_line_ending(self, limit):
+ if self.readuniversal:
+ return self.decoded.find_newline_universal(limit)
+ else:
+ if self.readtranslate:
+ # Newlines are already translated, only search for \n
+ newline = u'\n'
+ else:
+ # Non-universal mode.
+ newline = self.readnl
+ if newline == u'\r\n':
+ return self.decoded.find_crlf(limit)
+ else:
+ return self.decoded.find_char(newline[0], limit)
def readline_w(self, space, w_limit=None):
self._check_attached(space)
@@ -653,82 +720,52 @@
self._writeflush(space)
limit = convert_size(space, w_limit)
-
- line = None
- remaining = None
+ remnant = None
builder = StringBuilder()
-
while True:
# First, get some data if necessary
- has_data = True
- while not self.decoded_chars:
- try:
- if not self._read_chunk(space):
- has_data = False
- break
- except OperationError as e:
- if trap_eintr(space, e):
- continue
- raise
+ has_data = self._ensure_data(space)
if not has_data:
# end of file
- self._unset_decoded()
- self.snapshot = None
- start = endpos = offset_to_buffer = 0
+ if remnant:
+ builder.append(remnant)
break
- if not remaining:
- line = self.decoded_chars
- start = self.decoded_chars_used
- offset_to_buffer = 0
+ if remnant:
+ assert not self.readtranslate and self.readnl == '\r\n'
+ assert self.decoded.pos == 0
+ if remnant == '\r' and self.decoded.text[0] == '\n':
+ builder.append('\r\n')
+ self.decoded.pos = 1
+ remnant = None
+ break
+ else:
+ builder.append(remnant)
+ remnant = None
+ continue
+
+ if limit > 0:
+ remaining = limit - builder.getlength()
+ assert remaining >= 0
else:
- assert self.decoded_chars_used == 0
- line = remaining + self.decoded_chars
- start = 0
- offset_to_buffer = len(remaining)
- remaining = None
+ remaining = -1
+ start = self.decoded.pos
+ assert start >= 0
+ found = self._scan_line_ending(remaining)
+ end_scan = self.decoded.pos
+ if end_scan > start:
+ s = self.decoded.text[start:end_scan]
+ builder.append(s)
- line_len = len(line)
- endpos, consumed = self._find_line_ending(line, start, line_len)
- chunked = builder.getlength()
- if endpos >= 0:
- if limit >= 0 and endpos >= start + limit - chunked:
- endpos = start + limit - chunked
- assert endpos >= 0
- break
- assert consumed >= 0
-
- # We can put aside up to `endpos`
- endpos = consumed + start
- if limit >= 0 and endpos >= start + limit - chunked:
- # Didn't find line ending, but reached length limit
- endpos = start + limit - chunked
- assert endpos >= 0
+ if found or (limit >= 0 and builder.getlength() >= limit):
break
- # No line ending seen yet - put aside current data
- if endpos > start:
- s = line[start:endpos]
- builder.append(s)
-
- # There may be some remaining bytes we'll have to prepend to the
+ # There may be some remaining chars we'll have to prepend to the
# next chunk of data
- if endpos < line_len:
- remaining = line[endpos:]
- line = None
+ if not self.decoded.exhausted():
+ remnant = self.decoded.get_chars(-1)
# We have consumed the buffer
- self._unset_decoded()
-
- if line:
- # Our line ends in the current buffer
- decoded_chars_used = endpos - offset_to_buffer
- assert decoded_chars_used >= 0
- self.decoded_chars_used = decoded_chars_used
- if start > 0 or endpos < len(line):
- line = line[start:endpos]
- builder.append(line)
- elif remaining:
- builder.append(remaining)
+ self.decoded.reset()
result = builder.build()
return space.new_from_utf8(result)
@@ -862,7 +899,7 @@
raise oefmt(space.w_IOError,
"can't do nonzero end-relative seeks")
space.call_method(self, "flush")
- self._unset_decoded()
+ self.decoded.reset()
self.snapshot = None
if self.w_decoder:
space.call_method(self.w_decoder, "reset")
@@ -887,7 +924,7 @@
# Seek back to the safe start point
space.call_method(self.w_buffer, "seek", space.newint(cookie.start_pos))
- self._unset_decoded()
+ self.decoded.reset()
self.snapshot = None
# Restore the decoder to its state from the safe start point.
@@ -908,13 +945,13 @@
w_decoded = space.call_method(self.w_decoder, "decode",
w_chunk, space.newbool(bool(cookie.need_eof)))
- self._set_decoded(space, w_decoded)
+ self.decoded.set(space, w_decoded)
# Skip chars_to_skip of the decoded characters
- if len(self.decoded_chars) < cookie.chars_to_skip:
+ if len(self.decoded.text) < cookie.chars_to_skip:
raise oefmt(space.w_IOError,
"can't restore logical file position")
- self.decoded_chars_used = cookie.chars_to_skip
+ self.decoded.pos = cookie.chars_to_skip
else:
self.snapshot = PositionSnapshot(cookie.dec_flags, "")
@@ -940,7 +977,7 @@
w_pos = space.call_method(self.w_buffer, "tell")
if self.w_decoder is None or self.snapshot is None:
- assert not self.decoded_chars
+ assert not self.decoded.text
return w_pos
cookie = PositionCookie(space.bigint_w(w_pos))
@@ -951,11 +988,11 @@
cookie.start_pos -= len(input)
# How many decoded characters have been used up since the snapshot?
- if not self.decoded_chars_used:
+ if not self.decoded.pos:
# We haven't moved from the snapshot point.
return space.newlong_from_rbigint(cookie.pack())
- chars_to_skip = self.decoded_chars_used
+ chars_to_skip = self.decoded.pos
# Starting from the snapshot position, we will walk the decoder
# forward until it gives us enough decoded characters.
diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_io/test/test_interp_textio.py
@@ -0,0 +1,68 @@
+import pytest
+try:
+ from hypothesis import given, strategies as st, assume
+except ImportError:
+ pytest.skip("hypothesis required")
+from pypy.module._io.interp_bytesio import W_BytesIO
+from pypy.module._io.interp_textio import W_TextIOWrapper, DecodeBuffer
+
+LINESEP = ['', '\r', '\n', '\r\n']
+
+@st.composite
+def text_with_newlines(draw):
+ sep = draw(st.sampled_from(LINESEP))
+ lines = draw(st.lists(st.text(max_size=10), max_size=10))
+ return sep.join(lines)
+
+@given(txt=text_with_newlines(),
+ mode=st.sampled_from(['\r', '\n', '\r\n', '']),
+ limit=st.integers(min_value=-1))
+def test_readline(space, txt, mode, limit):
+ assume(limit != 0)
+ w_stream = W_BytesIO(space)
+ w_stream.descr_init(space, space.newbytes(txt.encode('utf-8')))
+ w_textio = W_TextIOWrapper(space)
+ w_textio.descr_init(
+ space, w_stream, encoding='utf-8',
+ w_newline=space.newtext(mode))
+ lines = []
+ while True:
+ line = space.unicode_w(w_textio.readline_w(space, space.newint(limit)))
+ if limit > 0:
+ assert len(line) <= limit
+ if line:
+ lines.append(line)
+ else:
+ break
+ assert u''.join(lines) == txt
+
+@given(st.text())
+def test_read_buffer(text):
+ buf = DecodeBuffer(text)
+ assert buf.get_chars(-1) == text
+ assert buf.exhausted()
+
+@given(st.text(), st.lists(st.integers(min_value=0)))
+def test_readn_buffer(text, sizes):
+ buf = DecodeBuffer(text)
+ strings = []
+ for n in sizes:
+ s = buf.get_chars(n)
+ if not buf.exhausted():
+ assert len(s) == n
+ else:
+ assert len(s) <= n
+ strings.append(s)
+ assert ''.join(strings) == text[:sum(sizes)]
+
+@given(st.text())
+def test_next_char(text):
+ buf = DecodeBuffer(text)
+ chars = []
+ try:
+ while True:
+ chars.append(buf.next_char())
+ except StopIteration:
+ pass
+ assert buf.exhausted()
+ assert u''.join(chars) == text
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -197,19 +197,21 @@
MBENC_FLUSH = 1
MBENC_RESET = 2
-def encode(codec, unicodedata, errors="strict", errorcb=None, namecb=None):
+def encode(codec, unicodedata, length, errors="strict", errorcb=None,
+ namecb=None):
encodebuf = pypy_cjk_enc_new(codec)
if not encodebuf:
raise MemoryError
try:
- return encodeex(encodebuf, unicodedata, errors, errorcb, namecb)
+ return encodeex(encodebuf, unicodedata, length, errors, errorcb, namecb)
finally:
pypy_cjk_enc_free(encodebuf)
-def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None,
+def encodeex(encodebuf, utf8data, length, errors="strict", errorcb=None,
namecb=None, ignore_error=0):
- inleft = len(unicodedata)
- with rffi.scoped_nonmoving_unicodebuffer(unicodedata) as inbuf:
+ inleft = length
+ inbuf = rffi.utf82wcharp(utf8data, length)
+ try:
if pypy_cjk_enc_init(encodebuf, inbuf, inleft) < 0:
raise MemoryError
if ignore_error == 0:
@@ -221,16 +223,18 @@
if r == 0 or r == ignore_error:
break
multibytecodec_encerror(encodebuf, r, errors,
- errorcb, namecb, unicodedata)
+ errorcb, namecb, utf8data)
while flags & MBENC_RESET:
r = pypy_cjk_enc_reset(encodebuf)
if r == 0:
break
multibytecodec_encerror(encodebuf, r, errors,
- errorcb, namecb, unicodedata)
+ errorcb, namecb, utf8data)
src = pypy_cjk_enc_outbuf(encodebuf)
length = pypy_cjk_enc_outlen(encodebuf)
return rffi.charpsize2str(src, length)
+ finally:
+ lltype.free(inbuf, flavor='raw')
def multibytecodec_encerror(encodebuf, e, errors,
errorcb, namecb, unicodedata):
@@ -256,21 +260,16 @@
elif errors == "replace":
codec = pypy_cjk_enc_getcodec(encodebuf)
try:
- replace = encode(codec, u"?")
+ replace = encode(codec, "?", 1)
except EncodeDecodeError:
replace = "?"
else:
assert errorcb
- XXX
- retu, rets, end = errorcb(errors, namecb, reason,
- unicodedata.encode("utf8"), start, end)
- if rets is not None:
- # py3k only
- replace = rets
- else:
- assert retu is not None
- codec = pypy_cjk_enc_getcodec(encodebuf)
- replace = encode(codec, retu, "strict", errorcb, namecb)
+ rets, end = errorcb(errors, namecb, reason,
+ unicodedata, start, end)
+ codec = pypy_cjk_enc_getcodec(encodebuf)
+ lgt, _ = rutf8.get_utf8_length_flag(rets)
+ replace = encode(codec, rets, lgt, "strict", errorcb, namecb)
with rffi.scoped_nonmovingbuffer(replace) as inbuf:
r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
if r == MBERR_NOMEMORY:
diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py
--- a/pypy/module/_multibytecodec/interp_incremental.py
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -1,4 +1,5 @@
from rpython.rtyper.lltypesystem import lltype
+from rpython.rlib import rutf8
from pypy.module._multibytecodec import c_codecs
from pypy.module._multibytecodec.interp_multibytecodec import (
MultibyteCodec, wrap_unicodedecodeerror, wrap_runtimeerror,
@@ -65,7 +66,8 @@
pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
assert 0 <= pos <= len(object)
self.pending = object[pos:]
- return space.newunicode(output)
+ lgt, flag = rutf8.get_utf8_length_flag(output)
+ return space.newutf8(output, lgt, flag)
@unwrap_spec(errors="text_or_none")
@@ -88,7 +90,8 @@
def _initialize(self):
self.encodebuf = c_codecs.pypy_cjk_enc_new(self.codec)
- self.pending = u""
+ self.pending = ""
+ self.pending_len = 0
def _free(self):
self.pending = None
@@ -96,25 +99,37 @@
c_codecs.pypy_cjk_enc_free(self.encodebuf)
self.encodebuf = lltype.nullptr(c_codecs.ENCODEBUF_P.TO)
- @unwrap_spec(object='utf8', final=bool)
- def encode_w(self, object, final=False):
- u_object = object.decode('utf8')
+ @unwrap_spec(final=bool)
+ def encode_w(self, space, w_object, final=False):
+ utf8data, length = space.utf8_len_w(w_object)
space = self.space
state = space.fromcache(CodecState)
if len(self.pending) > 0:
- u_object = self.pending + u_object
+ utf8data = self.pending + utf8data
+ length += self.pending_len
try:
- output = c_codecs.encodeex(self.encodebuf, u_object, self.errors,
+ output = c_codecs.encodeex(self.encodebuf, utf8data, length,
+ self.errors,
state.encode_error_handler, self.name,
get_ignore_error(final))
except c_codecs.EncodeDecodeError as e:
- raise wrap_unicodeencodeerror(space, e, object, len(u_object),
+ raise wrap_unicodeencodeerror(space, e, utf8data, length,
self.name)
except RuntimeError:
raise wrap_runtimeerror(space)
pos = c_codecs.pypy_cjk_enc_inbuf_consumed(self.encodebuf)
- assert 0 <= pos <= len(u_object)
- self.pending = u_object[pos:]
+ assert 0 <= pos <= length
+ # scan the utf8 string until we hit pos
+ i = 0
+ stop = length - pos
+ self.pending_len = stop
+ if stop > 0:
+ while pos > 0:
+ i = rutf8.next_codepoint_pos(utf8data, i)
+ pos -= 1
+ self.pending = utf8data[i:]
+ else:
+ self.pending = ""
return space.newbytes(output)
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -31,23 +31,23 @@
return space.newtuple([space.newutf8(utf8_output, lgt, flag),
space.newint(len(input))])
- @unwrap_spec(input='utf8', errors="text_or_none")
- def encode(self, space, input, errors=None):
+ @unwrap_spec(errors="text_or_none")
+ def encode(self, space, w_input, errors=None):
if errors is None:
errors = 'strict'
state = space.fromcache(CodecState)
+ input, length = space.utf8_len_w(w_input)
#
- u_input = input.decode('utf8')
try:
- output = c_codecs.encode(self.codec, u_input, errors,
+ output = c_codecs.encode(self.codec, input, length, errors,
state.encode_error_handler, self.name)
except c_codecs.EncodeDecodeError as e:
- raise wrap_unicodeencodeerror(space, e, input, len(u_input),
+ raise wrap_unicodeencodeerror(space, e, input, length,
self.name)
except RuntimeError:
raise wrap_runtimeerror(space)
return space.newtuple([space.newbytes(output),
- space.newint(len(u_input))])
+ space.newint(length)])
MultibyteCodec.typedef = TypeDef(
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -14,27 +14,27 @@
def test_decode_gbk():
c = getcodec("gbk")
u = decode(c, "\xA1\xAA")
- assert u == unichr(0x2014)
+ assert u == unichr(0x2014).encode('utf8')
u = decode(c, "foobar")
- assert u == u"foobar"
+ assert u == "foobar"
def test_decode_hz():
# stateful
c = getcodec("hz")
u = decode(c, "~{abc}")
- assert u == u'\u5f95\u6cef'
+ assert u == u'\u5f95\u6cef'.encode('utf8')
u = decode(c, "~{")
- assert u == u''
+ assert u == ''
def test_decodeex_hz():
c = getcodec("hz")
decodebuf = c_codecs.pypy_cjk_dec_new(c)
u = c_codecs.decodeex(decodebuf, "~{abcd~}")
- assert u == u'\u5f95\u6c85'
+ assert u == u'\u5f95\u6c85'.encode('utf8')
u = c_codecs.decodeex(decodebuf, "~{efgh~}")
- assert u == u'\u5f50\u73b7'
+ assert u == u'\u5f50\u73b7'.encode('utf8')
u = c_codecs.decodeex(decodebuf, "!~{abcd~}xyz~{efgh")
- assert u == u'!\u5f95\u6c85xyz\u5f50\u73b7'
+ assert u == u'!\u5f95\u6c85xyz\u5f50\u73b7'.encode('utf8')
c_codecs.pypy_cjk_dec_free(decodebuf)
def test_decodeex_hz_incomplete():
@@ -64,7 +64,7 @@
buf += c
u = c_codecs.decodeex(decodebuf, buf,
ignore_error = c_codecs.MBERR_TOOFEW)
- assert u == output
+ assert u == output.encode('utf8')
incompletepos = c_codecs.pypy_cjk_dec_inbuf_consumed(decodebuf)
buf = buf[incompletepos:]
assert buf == ''
@@ -86,46 +86,47 @@
def test_decode_hz_ignore():
c = getcodec("hz")
u = decode(c, 'def~{}abc', 'ignore')
- assert u == u'def\u5fcf'
+ assert u == u'def\u5fcf'.encode('utf8')
def test_decode_hz_replace():
c = getcodec("hz")
u = decode(c, 'def~{}abc', 'replace')
- assert u == u'def\ufffd\u5fcf'
+ assert u == u'def\ufffd\u5fcf'.encode('utf8')
def test_encode_hz():
c = getcodec("hz")
- s = encode(c, u'foobar')
+ s = encode(c, u'foobar'.encode('utf8'), 6)
assert s == 'foobar' and type(s) is str
- s = encode(c, u'\u5f95\u6cef')
+ s = encode(c, u'\u5f95\u6cef'.encode('utf8'), 2)
assert s == '~{abc}~}'
def test_encode_hz_error():
# error
c = getcodec("hz")
- e = py.test.raises(EncodeDecodeError, encode, c, u'abc\u1234def').value
+ e = py.test.raises(EncodeDecodeError, encode, c, u'abc\u1234def'.encode('utf8'), 7).value
assert e.start == 3
assert e.end == 4
assert e.reason == "illegal multibyte sequence"
def test_encode_hz_ignore():
c = getcodec("hz")
- s = encode(c, u'abc\u1234def', 'ignore')
+ s = encode(c, u'abc\u1234def'.encode('utf8'), 7, 'ignore')
assert s == 'abcdef'
def test_encode_hz_replace():
c = getcodec("hz")
- s = encode(c, u'abc\u1234def', 'replace')
+ s = encode(c, u'abc\u1234def'.encode('utf8'), 7, 'replace')
assert s == 'abc?def'
def test_encode_jisx0208():
c = getcodec('iso2022_jp')
- s = encode(c, u'\u83ca\u5730\u6642\u592b')
+ s = encode(c, u'\u83ca\u5730\u6642\u592b'.encode('utf8'), 4)
assert s == '\x1b$B5FCO;~IW\x1b(B' and type(s) is str
def test_encode_custom_error_handler_bytes():
+ py.test.skip("needs revamping in py3k")
c = getcodec("hz")
def errorhandler(errors, enc, msg, t, startingpos, endingpos):
- return None, '\xc3', endingpos
- s = encode(c, u'abc\u1234def', 'foo', errorhandler)
+ return u'\xc3'.encode('utf8'), endingpos
+ s = encode(c, u'abc\u1234def'.encode('utf8'), 7, 'foo', errorhandler)
assert '\xc3' in s
diff --git a/pypy/module/_multibytecodec/test/test_translation.py b/pypy/module/_multibytecodec/test/test_translation.py
--- a/pypy/module/_multibytecodec/test/test_translation.py
+++ b/pypy/module/_multibytecodec/test/test_translation.py
@@ -1,6 +1,7 @@
from pypy.module._multibytecodec import c_codecs
from rpython.translator.c.test import test_standalone
from rpython.config.translationoption import get_combined_translation_config
+from rpython.rlib import rutf8
class TestTranslation(test_standalone.StandaloneTests):
@@ -13,7 +14,8 @@
codecname, string = argv[1], argv[2]
c = c_codecs.getcodec(codecname)
u = c_codecs.decode(c, string)
- r = c_codecs.encode(c, u)
+ lgt, _ = rutf8.get_utf8_length_flag(u)
+ r = c_codecs.encode(c, u, lgt)
print r
return 0
#
diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -1,7 +1,7 @@
import sys
from rpython.rlib.rstring import StringBuilder
from rpython.rlib.objectmodel import specialize, always_inline, r_dict
-from rpython.rlib import rfloat, runicode
+from rpython.rlib import rfloat, runicode, rutf8
from rpython.rtyper.lltypesystem import lltype, rffi
from pypy.interpreter.error import oefmt
from pypy.interpreter import unicodehelper
@@ -19,29 +19,6 @@
return 0.0
return x * NEG_POW_10[exp]
-def strslice2unicode_latin1(s, start, end):
- """
- Convert s[start:end] to unicode. s is supposed to be an RPython string
- encoded in latin-1, which means that the numeric value of each char is the
- same as the corresponding unicode code point.
-
- Internally it's implemented at the level of low-level helpers, to avoid
- the extra copy we would need if we take the actual slice first.
-
- No bound checking is done, use carefully.
- """
- from rpython.rtyper.annlowlevel import llstr, hlunicode
- from rpython.rtyper.lltypesystem.rstr import malloc, UNICODE
- from rpython.rtyper.lltypesystem.lltype import cast_primitive, UniChar
- length = end-start
- ll_s = llstr(s)
- ll_res = malloc(UNICODE, length)
- ll_res.hash = 0
- for i in range(length):
- ch = ll_s.chars[start+i]
- ll_res.chars[i] = cast_primitive(UniChar, ch)
- return hlunicode(ll_res)
-
def slice_eq(a, b):
(ll_chars1, start1, length1, _) = a
(ll_chars2, start2, length2, _) = b
@@ -270,10 +247,11 @@
self.pos = i+1
return self.space.newdict()
- d = {}
+ # XXX this should be improved to use an unwrapped dict
+ w_dict = self.space.newdict()
while True:
# parse a key: value
- name = self.decode_key(i)
+ w_name = self.decode_key(i)
i = self.skip_whitespace(self.pos)
ch = self.ll_chars[i]
if ch != ':':
@@ -282,13 +260,13 @@
i = self.skip_whitespace(i)
#
w_value = self.decode_any(i)
- d[name] = w_value
+ self.space.setitem(w_dict, w_name, w_value)
i = self.skip_whitespace(self.pos)
ch = self.ll_chars[i]
i += 1
if ch == '}':
self.pos = i
- return self._create_dict(d)
+ return w_dict
elif ch == ',':
pass
elif ch == '\0':
@@ -297,10 +275,6 @@
self._raise("Unexpected '%s' when decoding object (char %d)",
ch, i-1)
- def _create_dict(self, d):
- from pypy.objspace.std.dictmultiobject import from_unicode_key_dict
- return from_unicode_key_dict(self.space, d)
-
def decode_string(self, i):
start = i
bits = 0
@@ -312,8 +286,7 @@
bits |= ord(ch)
if ch == '"':
self.pos = i
- return self.space.newunicode(
- self._create_string(start, i - 1, bits))
+ return self._create_string(start, i - 1, bits)
elif ch == '\\' or ch < '\x20':
self.pos = i-1
return self.decode_string_escaped(start)
@@ -322,12 +295,15 @@
if bits & 0x80:
# the 8th bit is set, it's an utf8 string
content_utf8 = self.getslice(start, end)
- return unicodehelper.decode_utf8(self.space, content_utf8)
+ lgt, flag = unicodehelper.check_utf8_or_raise(self.space,
+ content_utf8)
+ return self.space.newutf8(content_utf8, lgt, flag)
else:
# ascii only, fast path (ascii is a strict subset of
# latin1, and we already checked that all the chars are <
# 128)
- return strslice2unicode_latin1(self.s, start, end)
+ return self.space.newutf8(self.getslice(start, end),
+ end - start, rutf8.FLAG_ASCII)
def decode_string_escaped(self, start):
i = self.pos
@@ -340,9 +316,10 @@
i += 1
if ch == '"':
content_utf8 = builder.build()
- content_unicode = unicodehelper.decode_utf8(self.space, content_utf8)
+ lgt, f = unicodehelper.check_utf8_or_raise(self.space,
+ content_utf8)
self.pos = i
- return self.space.newunicode(content_unicode)
+ return self.space.newutf8(content_utf8, lgt, f)
elif ch == '\\':
i = self.decode_escape_sequence(i, builder)
elif ch < '\x20':
@@ -389,8 +366,7 @@
return # help the annotator to know that we'll never go beyond
# this point
#
- uchr = runicode.code_to_unichr(val) # may be a surrogate pair again
- utf8_ch = unicodehelper.encode_utf8(self.space, uchr)
+ utf8_ch = rutf8.unichr_as_utf8(val, allow_surrogates=True)
builder.append(utf8_ch)
return i
@@ -404,7 +380,7 @@
return 0x10000 + (((highsurr - 0xd800) << 10) | (lowsurr - 0xdc00))
def decode_key(self, i):
- """ returns an unwrapped unicode """
+ """ returns a wrapped unicode """
from rpython.rlib.rarithmetic import intmask
i = self.skip_whitespace(i)
diff --git a/pypy/module/_pypyjson/interp_encoder.py b/pypy/module/_pypyjson/interp_encoder.py
--- a/pypy/module/_pypyjson/interp_encoder.py
+++ b/pypy/module/_pypyjson/interp_encoder.py
@@ -1,5 +1,5 @@
from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.runicode import str_decode_utf_8
+from rpython.rlib import rutf8
from pypy.interpreter import unicodehelper
@@ -30,11 +30,8 @@
# the input is a string with only non-special ascii chars
return w_string
- eh = unicodehelper.decode_error_handler(space)
- u = str_decode_utf_8(
- s, len(s), None, final=True, errorhandler=eh,
- allow_surrogates=True)[0]
- sb = StringBuilder(len(u))
+ unicodehelper.check_utf8_or_raise(space, s)
+ sb = StringBuilder(len(s))
sb.append_slice(s, 0, first)
else:
# We used to check if 'u' contains only safe characters, and return
@@ -44,29 +41,31 @@
# a string (with the ascii encoding). This requires two passes
# over the characters. So we may as well directly turn it into a
# string here --- only one pass.
- u = space.unicode_w(w_string)
- sb = StringBuilder(len(u))
+ s = space.utf8_w(w_string)
+ sb = StringBuilder(len(s))
first = 0
- for i in range(first, len(u)):
- c = u[i]
- if c <= u'~':
- if c == u'"' or c == u'\\':
+ it = rutf8.Utf8StringIterator(s)
+ for i in range(first):
+ it.next()
+ for c in it:
+ if c <= ord('~'):
+ if c == ord('"') or c == ord('\\'):
sb.append('\\')
- elif c < u' ':
- sb.append(ESCAPE_BEFORE_SPACE[ord(c)])
+ elif c < ord(' '):
+ sb.append(ESCAPE_BEFORE_SPACE[c])
continue
- sb.append(chr(ord(c)))
+ sb.append(chr(c))
else:
- if c <= u'\uffff':
+ if c <= ord(u'\uffff'):
sb.append('\\u')
- sb.append(HEX[ord(c) >> 12])
- sb.append(HEX[(ord(c) >> 8) & 0x0f])
- sb.append(HEX[(ord(c) >> 4) & 0x0f])
- sb.append(HEX[ord(c) & 0x0f])
+ sb.append(HEX[c >> 12])
+ sb.append(HEX[(c >> 8) & 0x0f])
+ sb.append(HEX[(c >> 4) & 0x0f])
+ sb.append(HEX[c & 0x0f])
else:
# surrogate pair
- n = ord(c) - 0x10000
+ n = c - 0x10000
s1 = 0xd800 | ((n >> 10) & 0x3ff)
sb.append('\\ud')
sb.append(HEX[(s1 >> 8) & 0x0f])
diff --git a/pypy/module/_pypyjson/test/test__pypyjson.py b/pypy/module/_pypyjson/test/test__pypyjson.py
--- a/pypy/module/_pypyjson/test/test__pypyjson.py
+++ b/pypy/module/_pypyjson/test/test__pypyjson.py
@@ -10,10 +10,14 @@
assert dec.skip_whitespace(8) == len(s)
dec.close()
+class FakeSpace(object):
+ def newutf8(self, s, l, f):
+ return s
+
def test_decode_key():
s1 = "123" * 100
s = ' "%s" "%s" ' % (s1, s1)
- dec = JSONDecoder('fake space', s)
+ dec = JSONDecoder(FakeSpace(), s)
assert dec.pos == 0
x = dec.decode_key(0)
assert x == s1
diff --git a/pypy/module/_rawffi/alt/interp_funcptr.py b/pypy/module/_rawffi/alt/interp_funcptr.py
--- a/pypy/module/_rawffi/alt/interp_funcptr.py
+++ b/pypy/module/_rawffi/alt/interp_funcptr.py
@@ -167,8 +167,8 @@
addr = rffi.cast(rffi.ULONG, buf)
self.argchain.arg(addr)
- def handle_unichar_p(self, w_ffitype, w_obj, unicodeval):
- buf = rffi.unicode2wcharp(unicodeval)
+ def handle_unichar_p(self, w_ffitype, w_obj, utf8val, utf8len):
+ buf = rffi.utf82wcharp(utf8val, utf8len)
self.w_func.to_free.append(rffi.cast(rffi.VOIDP, buf))
addr = rffi.cast(rffi.ULONG, buf)
self.argchain.arg(addr)
diff --git a/pypy/module/_rawffi/alt/test/test_type_converter.py b/pypy/module/_rawffi/alt/test/test_type_converter.py
--- a/pypy/module/_rawffi/alt/test/test_type_converter.py
+++ b/pypy/module/_rawffi/alt/test/test_type_converter.py
@@ -6,7 +6,7 @@
class DummyFromAppLevelConverter(FromAppLevelConverter):
- def handle_all(self, w_ffitype, w_obj, val):
+ def handle_all(self, w_ffitype, w_obj, val, lgt=None):
self.lastval = val
handle_signed = handle_all
@@ -120,8 +120,8 @@
def test_strings(self):
# first, try automatic conversion from applevel
self.check(app_types.char_p, self.space.newbytes('foo'), 'foo')
- self.check(app_types.unichar_p, self.space.wrap(u'foo\u1234'), u'foo\u1234')
- self.check(app_types.unichar_p, self.space.wrap('foo'), u'foo')
+ self.check(app_types.unichar_p, self.space.wrap(u'foo\u1234'), u'foo\u1234'.encode('utf8'))
+ self.check(app_types.unichar_p, self.space.wrap('foo'), 'foo')
# then, try to pass explicit pointers
self.check(app_types.char_p, self.space.wrap(42), 42)
self.check(app_types.unichar_p, self.space.wrap(42), 42)
diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py
--- a/pypy/module/_rawffi/alt/type_converter.py
+++ b/pypy/module/_rawffi/alt/type_converter.py
@@ -1,6 +1,6 @@
from rpython.rlib import libffi
-from rpython.rlib import jit
-from rpython.rlib.rarithmetic import r_uint
+from rpython.rlib import jit, rutf8
+from rpython.rlib.rarithmetic import r_uint, intmask
from pypy.interpreter.error import oefmt
from pypy.module._rawffi.structure import W_StructureInstance, W_Structure
from pypy.module._rawffi.alt.interp_ffitype import app_types
@@ -85,8 +85,8 @@
return True
elif w_ffitype.is_unichar_p() and (w_type is self.space.w_bytes or
w_type is self.space.w_unicode):
- unicodeval = self.space.unicode_w(w_obj)
- self.handle_unichar_p(w_ffitype, w_obj, unicodeval)
+ utf8, lgt = self.space.utf8_len_w(w_obj)
+ self.handle_unichar_p(w_ffitype, w_obj, utf8, lgt)
return True
return False
@@ -147,7 +147,7 @@
"""
self.error(w_ffitype, w_obj)
- def handle_unichar_p(self, w_ffitype, w_obj, unicodeval):
+ def handle_unichar_p(self, w_ffitype, w_obj, utf8val, utf8len):
"""
unicodeval: interp-level unicode
"""
@@ -228,7 +228,8 @@
return space.newbytes(chr(ucharval))
elif w_ffitype.is_unichar():
wcharval = self.get_unichar(w_ffitype)
- return space.newunicode(unichr(wcharval))
+ return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1,
+ rutf8.get_flag_from_code(intmask(wcharval)))
elif w_ffitype.is_double():
return self._float(w_ffitype)
elif w_ffitype.is_singlefloat():
diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -10,6 +10,7 @@
from rpython.rtyper.lltypesystem import lltype, rffi
from rpython.rtyper.tool import rffi_platform
from rpython.rlib.unroll import unrolling_iterable
+from rpython.rlib import rutf8
from rpython.rlib.objectmodel import specialize
import rpython.rlib.rposix as rposix
@@ -416,13 +417,13 @@
val = s[0]
push_func(add_arg, argdesc, val)
elif letter == 'u':
- s = space.unicode_w(w_arg)
- if len(s) != 1:
+ s, lgt = space.utf8_len_w(w_arg)
+ if lgt != 1:
raise oefmt(space.w_TypeError,
"Expected unicode string of length one as wide "
"character")
- val = s[0]
- push_func(add_arg, argdesc, val)
+ val = rutf8.codepoint_at_pos(s, 0)
+ push_func(add_arg, argdesc, rffi.cast(rffi.WCHAR_T, val))
else:
for c in unroll_letters_for_numbers:
if letter == c:
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -7,7 +7,8 @@
from pypy.interpreter.error import OperationError, oefmt
from rpython.rlib.rarithmetic import intmask
from rpython.rlib import jit
-from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
+from rpython.rlib.rstring import StringBuilder
+from rpython.rlib.rutf8 import Utf8StringBuilder
# ____________________________________________________________
#
@@ -237,8 +238,8 @@
filter_is_callable = True
else:
if space.isinstance_w(w_ptemplate, space.w_unicode):
- filter_as_unicode = space.unicode_w(w_ptemplate)
- literal = u'\\' not in filter_as_unicode
+ filter_as_unicode = space.utf8_w(w_ptemplate)
+ literal = '\\' not in filter_as_unicode
use_builder = (
space.isinstance_w(w_string, space.w_unicode) and literal)
else:
@@ -267,7 +268,7 @@
sublist_w = strbuilder = unicodebuilder = None
if use_builder:
if filter_as_unicode is not None:
- unicodebuilder = UnicodeBuilder(ctx.end)
+ unicodebuilder = Utf8StringBuilder(ctx.end)
else:
assert filter_as_string is not None
strbuilder = StringBuilder(ctx.end)
@@ -335,7 +336,9 @@
return space.newbytes(strbuilder.build()), n
else:
assert unicodebuilder is not None
- return space.newunicode(unicodebuilder.build()), n
+ return space.newutf8(unicodebuilder.build(),
+ unicodebuilder.get_length(),
+ unicodebuilder.get_flag()), n
else:
if space.isinstance_w(w_string, space.w_unicode):
w_emptystr = space.newunicode(u'')
diff --git a/pypy/module/_ssl/interp_ssl.py b/pypy/module/_ssl/interp_ssl.py
--- a/pypy/module/_ssl/interp_ssl.py
+++ b/pypy/module/_ssl/interp_ssl.py
@@ -1566,12 +1566,13 @@
cadata = space.bufferstr_w(w_cadata)
else:
ca_file_type = SSL_FILETYPE_PEM
- try:
- cadata = space.unicode_w(w_cadata).encode('ascii')
- except UnicodeEncodeError:
+ w_uni = space.convert_arg_to_w_unicode(w_cadata)
+ if not w_uni.is_ascii():
raise oefmt(space.w_TypeError,
"cadata should be a ASCII string or a "
"bytes-like object")
+ cadata = space.utf8_w(w_uni)
+
if cafile is None and capath is None and cadata is None:
raise oefmt(space.w_TypeError,
"cafile and capath cannot be both omitted")
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -1257,12 +1257,6 @@
create_iterator_classes(UnicodeDictStrategy)
-def from_unicode_key_dict(space, d):
- strategy = space.fromcache(UnicodeDictStrategy)
- storage = strategy.erase(d)
- return W_DictObject(space, strategy, storage)
-
-
class IntDictStrategy(AbstractTypedStrategy, DictStrategy):
erase, unerase = rerased.new_erasing_pair("int")
erase = staticmethod(erase)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -367,23 +367,10 @@
assert isinstance(utf8s, str)
return W_UnicodeObject(utf8s, length, flag)
- def new_from_utf8(self, utf8s):
- # XXX: kill me!
- assert isinstance(utf8s, str)
- length, flag = rutf8.check_utf8(utf8s, True)
- return W_UnicodeObject(utf8s, length, flag)
-
def newfilename(self, s):
assert isinstance(s, str) # on pypy3, this decodes the byte string
return W_BytesObject(s) # with the filesystem encoding
- def newunicode(self, unistr):
- # XXX: kill me!
- assert isinstance(unistr, unicode)
- utf8s = unistr.encode("utf-8")
- length, flag = rutf8.check_utf8(utf8s, True)
- return self.newutf8(utf8s, length, flag)
-
def type(self, w_obj):
jit.promote(w_obj.__class__)
return w_obj.getclass(self)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -64,6 +64,11 @@
# - malloced object, which means it has index, then
# _index_storage.flags determines the kind
+ @staticmethod
+ def from_utf8builder(builder):
+ return W_UnicodeObject(
+ builder.build(), builder.get_length(), builder.get_flag())
+
def __repr__(self):
"""representation for debugging purposes"""
return "%s(%r)" % (self.__class__.__name__, self._utf8)
@@ -344,57 +349,38 @@
return mod_format(space, w_values, self, do_unicode=True)
def descr_swapcase(self, space):
- selfvalue = self._utf8
- builder = StringBuilder(len(selfvalue))
- flag = self._get_flag()
- i = 0
- while i < len(selfvalue):
- ch = rutf8.codepoint_at_pos(selfvalue, i)
- i = rutf8.next_codepoint_pos(selfvalue, i)
+ input = self._utf8
+ builder = rutf8.Utf8StringBuilder(len(input))
+ for ch in rutf8.Utf8StringIterator(input):
if unicodedb.isupper(ch):
ch = unicodedb.tolower(ch)
elif unicodedb.islower(ch):
ch = unicodedb.toupper(ch)
- if ch >= 0x80:
- flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
- rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
- return W_UnicodeObject(builder.build(), self._length, flag)
+ builder.append_code(ch)
+ return self.from_utf8builder(builder)
def descr_title(self, space):
if len(self._utf8) == 0:
return self
- utf8, flag = self.title_unicode(self._utf8)
- return W_UnicodeObject(utf8, self._len(), flag)
+ return self.title_unicode(self._utf8)
@jit.elidable
def title_unicode(self, value):
input = self._utf8
- builder = StringBuilder(len(input))
- i = 0
+ builder = rutf8.Utf8StringBuilder(len(input))
previous_is_cased = False
- flag = self._get_flag()
- while i < len(input):
- ch = rutf8.codepoint_at_pos(input, i)
- i = rutf8.next_codepoint_pos(input, i)
+ for ch in rutf8.Utf8StringIterator(input):
if not previous_is_cased:
ch = unicodedb.totitle(ch)
else:
ch = unicodedb.tolower(ch)
- if ch >= 0x80:
- flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
- rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
+ builder.append_code(ch)
previous_is_cased = unicodedb.iscased(ch)
- return builder.build(), flag
+ return self.from_utf8builder(builder)
def descr_translate(self, space, w_table):
- input = self._utf8
- result = StringBuilder(len(input))
- result_length = 0
- flag = self._get_flag()
- i = 0
- while i < len(input):
- codepoint = rutf8.codepoint_at_pos(input, i)
- i = rutf8.next_codepoint_pos(input, i)
+ builder = rutf8.Utf8StringBuilder(len(self._utf8))
+ for codepoint in rutf8.Utf8StringIterator(self._utf8):
try:
w_newval = space.getitem(w_table, space.newint(codepoint))
except OperationError as e:
@@ -406,24 +392,19 @@
elif space.isinstance_w(w_newval, space.w_int):
codepoint = space.int_w(w_newval)
elif isinstance(w_newval, W_UnicodeObject):
- result.append(w_newval._utf8)
- flag = rutf8.combine_flags(flag, w_newval._get_flag())
- result_length += w_newval._length
+ builder.append_utf8(
+ w_newval._utf8, w_newval._length, w_newval._get_flag())
continue
else:
raise oefmt(space.w_TypeError,
"character mapping must return integer, None "
"or unicode")
try:
- if codepoint >= 0x80:
- flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
- rutf8.unichr_as_utf8_append(result, codepoint,
- allow_surrogates=True)
- result_length += 1
+ builder.append_code(codepoint)
except ValueError:
raise oefmt(space.w_TypeError,
"character mapping must be in range(0x110000)")
- return W_UnicodeObject(result.build(), result_length, flag)
+ return self.from_utf8builder(builder)
def descr_find(self, space, w_sub, w_start=None, w_end=None):
w_result = self._unwrap_and_search(space, w_sub, w_start, w_end)
@@ -517,12 +498,6 @@
def _join_return_one(self, space, w_obj):
return space.is_w(space.type(w_obj), space.w_unicode)
- def _join_check_item(self, space, w_obj):
- if (space.isinstance_w(w_obj, space.w_bytes) or
- space.isinstance_w(w_obj, space.w_unicode)):
- return 0
- return 1
-
def descr_formatter_parser(self, space):
from pypy.objspace.std.newformat import unicode_template_formatter
tformat = unicode_template_formatter(space, space.utf8_w(self))
@@ -534,16 +509,11 @@
return tformat.formatter_field_name_split()
def descr_lower(self, space):
- builder = StringBuilder(len(self._utf8))
- pos = 0
- flag = self._get_flag()
- while pos < len(self._utf8):
- lower = unicodedb.tolower(rutf8.codepoint_at_pos(self._utf8, pos))
- if lower >= 0x80:
- flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
- rutf8.unichr_as_utf8_append(builder, lower, allow_surrogates=True)
- pos = rutf8.next_codepoint_pos(self._utf8, pos)
- return W_UnicodeObject(builder.build(), self._len(), flag)
+ builder = rutf8.Utf8StringBuilder(len(self._utf8))
+ for ch in rutf8.Utf8StringIterator(self._utf8):
+ lower = unicodedb.tolower(ch)
+ builder.append_code(lower)
+ return self.from_utf8builder(builder)
def descr_isdecimal(self, space):
return self._is_generic(space, '_isdecimal')
@@ -657,13 +627,11 @@
flag = self._get_flag()
for i in range(size):
w_s = list_w[i]
- check_item = self._join_check_item(space, w_s)
- if check_item == 1:
+ if not (space.isinstance_w(w_s, space.w_bytes) or
+ space.isinstance_w(w_s, space.w_unicode)):
raise oefmt(space.w_TypeError,
- "sequence item %d: expected string, %T found",
+ "sequence item %d: expected string or unicode, %T found",
i, w_s)
- elif check_item == 2:
- return self._join_autoconvert(space, list_w)
# XXX Maybe the extra copy here is okay? It was basically going to
# happen anyway, what with being placed into the builder
w_u = self.convert_arg_to_w_unicode(space, w_s)
@@ -711,18 +679,11 @@
return space.newlist(strs_w)
def descr_upper(self, space):
- value = self._utf8
- builder = StringBuilder(len(value))
- flag = self._get_flag()
- i = 0
- while i < len(value):
- uchar = rutf8.codepoint_at_pos(value, i)
- uchar = unicodedb.toupper(uchar)
- if uchar >= 0x80:
- flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
- i = rutf8.next_codepoint_pos(value, i)
- rutf8.unichr_as_utf8_append(builder, uchar, allow_surrogates=True)
- return W_UnicodeObject(builder.build(), self._length, flag)
+ builder = rutf8.Utf8StringBuilder(len(self._utf8))
+ for ch in rutf8.Utf8StringIterator(self._utf8):
+ ch = unicodedb.toupper(ch)
+ builder.append_code(ch)
+ return self.from_utf8builder(builder)
@unwrap_spec(width=int)
def descr_zfill(self, space, width):
@@ -826,22 +787,15 @@
if len(value) == 0:
return self._empty()
- flag = self._get_flag()
- builder = StringBuilder(len(value))
- uchar = rutf8.codepoint_at_pos(value, 0)
- i = rutf8.next_codepoint_pos(value, 0)
+ builder = rutf8.Utf8StringBuilder(len(self._utf8))
+ it = rutf8.Utf8StringIterator(self._utf8)
+ uchar = it.next()
ch = unicodedb.toupper(uchar)
- rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
- if ch >= 0x80:
- flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
- while i < len(value):
- uchar = rutf8.codepoint_at_pos(value, i)
- i = rutf8.next_codepoint_pos(value, i)
- ch = unicodedb.tolower(uchar)
- rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
- if ch >= 0x80:
- flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
- return W_UnicodeObject(builder.build(), self._len(), flag)
+ builder.append_code(ch)
+ for ch in it:
+ ch = unicodedb.tolower(ch)
+ builder.append_code(ch)
+ return self.from_utf8builder(builder)
@unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
def descr_center(self, space, width, w_fillchar):
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,7 @@
cffi>=1.4.0
-vmprof>=0.4.10 # required to parse log files in rvmprof tests
+
+# parse log files in rvmprof tests
+vmprof>=0.4.10; 'x86' in platform.machine #skip arm, s390x
# hypothesis is used for test generation on untranslated tests
hypothesis
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -687,6 +687,11 @@
self._lgt += 1
unichr_as_utf8_append(self._s, code, True)
+ def append_utf8(self, utf8, length, flag):
+ self._flag = combine_flags(self._flag, flag)
+ self._lgt += length
+ self._s.append(utf8)
+
def build(self):
return self._s.build()
@@ -702,10 +707,12 @@
self._end = len(utf8s)
self._pos = 0
- def done(self):
- return self._pos == self._end
+ def __iter__(self):
+ return self
def next(self):
More information about the pypy-commit
mailing list