[pypy-commit] pypy default: Issue #2389: the custom error handler may return a 'pos' that is smaller than 'size'
arigo
pypy.commits at gmail.com
Thu Sep 1 06:36:52 EDT 2016
Author: Armin Rigo <arigo at tunes.org>
Branch:
Changeset: r86816:e9dd5882eed6
Date: 2016-09-01 12:23 +0200
http://bitbucket.org/pypy/pypy/changeset/e9dd5882eed6/
Log: Issue #2389: the custom error handler may return a 'pos' that is
smaller than 'size', in which case we need to continue looping
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -157,22 +157,26 @@
if pos + n > size:
if not final:
break
+ # argh, this obscure block of code is mostly a copy of
+ # what follows :-(
charsleft = size - pos - 1 # either 0, 1, 2
- # note: when we get the 'unexpected end of data' we don't care
- # about the pos anymore and we just ignore the value
+ # note: when we get the 'unexpected end of data' we need
+ # to care about the pos returned; it can be lower than size,
+ # in case we need to continue running this loop
if not charsleft:
# there's only the start byte and nothing else
r, pos = errorhandler(errors, 'utf8',
'unexpected end of data',
s, pos, pos+1)
result.append(r)
- break
+ continue
ordch2 = ord(s[pos+1])
if n == 3:
# 3-bytes seq with only a continuation byte
if (ordch2>>6 != 0x2 or # 0b10
- (ordch1 == 0xe0 and ordch2 < 0xa0)):
- # or (ordch1 == 0xed and ordch2 > 0x9f)
+ (ordch1 == 0xe0 and ordch2 < 0xa0)
+ or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f)
+ ):
# second byte invalid, take the first and continue
r, pos = errorhandler(errors, 'utf8',
'invalid continuation byte',
@@ -185,7 +189,7 @@
'unexpected end of data',
s, pos, pos+2)
result.append(r)
- break
+ continue
elif n == 4:
# 4-bytes seq with 1 or 2 continuation bytes
if (ordch2>>6 != 0x2 or # 0b10
@@ -210,7 +214,8 @@
'unexpected end of data',
s, pos, pos+charsleft+1)
result.append(r)
- break
+ continue
+ raise AssertionError("unreachable")
if n == 0:
r, pos = errorhandler(errors, 'utf8',
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -289,6 +289,12 @@
def setup_method(self, meth):
self.decoder = self.getdecoder('utf-8')
+ def custom_replace(self, errors, encoding, msg, s, startingpos, endingpos):
+ assert errors == 'custom'
+ # returns FOO, but consumes only one character (not up to endingpos)
+ FOO = u'\u1234'
+ return FOO, startingpos + 1
+
def to_bytestring(self, bytes):
return ''.join(chr(int(c, 16)) for c in bytes.split())
@@ -309,6 +315,7 @@
E.g. <80> is a continuation byte and can appear only after a start byte.
"""
FFFD = u'\ufffd'
+ FOO = u'\u1234'
for byte in '\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
py.test.raises(UnicodeDecodeError, self.decoder, byte, 1, None, final=True)
self.checkdecodeerror(byte, 'utf-8', 0, 1, addstuff=False,
@@ -320,6 +327,11 @@
assert self.decoder(byte, 1, 'ignore', final=True) == (u'', 1)
assert (self.decoder('aaaa' + byte + 'bbbb', 9, 'ignore',
final=True) == (u'aaaabbbb', 9))
+ assert self.decoder(byte, 1, 'custom', final=True,
+ errorhandler=self.custom_replace) == (FOO, 1)
+ assert (self.decoder('aaaa' + byte + 'bbbb', 9, 'custom',
+ final=True, errorhandler=self.custom_replace) ==
+ (u'aaaa'+ FOO + u'bbbb', 9))
def test_unexpected_end_of_data(self):
"""
@@ -343,6 +355,7 @@
'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
]
FFFD = u'\ufffd'
+ FOO = u'\u1234'
for seq in sequences:
seq = self.to_bytestring(seq)
py.test.raises(UnicodeDecodeError, self.decoder, seq, len(seq),
@@ -358,6 +371,12 @@
) == (u'', len(seq))
assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, 'ignore',
final=True) == (u'aaaabbbb', len(seq) + 8))
+ assert (self.decoder(seq, len(seq), 'custom', final=True,
+ errorhandler=self.custom_replace) ==
+ (FOO * len(seq), len(seq)))
+ assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, 'custom',
+ final=True, errorhandler=self.custom_replace) ==
+ (u'aaaa'+ FOO * len(seq) + u'bbbb', len(seq) + 8))
def test_invalid_cb_for_2bytes_seq(self):
"""
More information about the pypy-commit
mailing list