[pypy-commit] pypy default: Issue #2389: the custom error handler may return a 'pos' that is smaller than 'size'

Thu Sep 1 06:36:52 EDT 2016

Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r86816:e9dd5882eed6
Date: 2016-09-01 12:23 +0200
http://bitbucket.org/pypy/pypy/changeset/e9dd5882eed6/

Log:	Issue #2389: the custom error handler may return a 'pos' that is
	smaller than 'size', in which case we need to continue looping

diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -157,22 +157,26 @@
         if pos + n > size:
             if not final:
                 break
+            # argh, this obscure block of code is mostly a copy of
+            # what follows :-(
             charsleft = size - pos - 1 # either 0, 1, 2
-            # note: when we get the 'unexpected end of data' we don't care
-            # about the pos anymore and we just ignore the value
+            # note: when we get the 'unexpected end of data' we need
+            # to care about the pos returned; it can be lower than size,
+            # in case we need to continue running this loop
             if not charsleft:
                 # there's only the start byte and nothing else
                 r, pos = errorhandler(errors, 'utf8',
                                       'unexpected end of data',
                                       s, pos, pos+1)
                 result.append(r)
-                break
+                continue
             ordch2 = ord(s[pos+1])
             if n == 3:
                 # 3-bytes seq with only a continuation byte
                 if (ordch2>>6 != 0x2 or   # 0b10
-                    (ordch1 == 0xe0 and ordch2 < 0xa0)):
-                    # or (ordch1 == 0xed and ordch2 > 0x9f)
+                    (ordch1 == 0xe0 and ordch2 < 0xa0)
+                or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f)
+                    ):
                     # second byte invalid, take the first and continue
                     r, pos = errorhandler(errors, 'utf8',
                                           'invalid continuation byte',
@@ -185,7 +189,7 @@
                                       'unexpected end of data',
                                       s, pos, pos+2)
                     result.append(r)
-                    break
+                    continue
             elif n == 4:
                 # 4-bytes seq with 1 or 2 continuation bytes
                 if (ordch2>>6 != 0x2 or    # 0b10
@@ -210,7 +214,8 @@
                                       'unexpected end of data',
                                       s, pos, pos+charsleft+1)
                     result.append(r)
-                    break
+                    continue
+            raise AssertionError("unreachable")
 
         if n == 0:
             r, pos = errorhandler(errors, 'utf8',
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -289,6 +289,12 @@
     def setup_method(self, meth):
         self.decoder = self.getdecoder('utf-8')
 
+    def custom_replace(self, errors, encoding, msg, s, startingpos, endingpos):
+        assert errors == 'custom'
+        # returns FOO, but consumes only one character (not up to endingpos)
+        FOO = u'\u1234'
+        return FOO, startingpos + 1
+
     def to_bytestring(self, bytes):
         return ''.join(chr(int(c, 16)) for c in bytes.split())
 
@@ -309,6 +315,7 @@
         E.g. <80> is a continuation byte and can appear only after a start byte.
         """
         FFFD = u'\ufffd'
+        FOO = u'\u1234'
         for byte in '\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
             py.test.raises(UnicodeDecodeError, self.decoder, byte, 1, None, final=True)
             self.checkdecodeerror(byte, 'utf-8', 0, 1, addstuff=False,
@@ -320,6 +327,11 @@
             assert self.decoder(byte, 1, 'ignore', final=True) == (u'', 1)
             assert (self.decoder('aaaa' + byte + 'bbbb', 9, 'ignore',
                         final=True) == (u'aaaabbbb', 9))
+            assert self.decoder(byte, 1, 'custom', final=True,
+                        errorhandler=self.custom_replace) == (FOO, 1)
+            assert (self.decoder('aaaa' + byte + 'bbbb', 9, 'custom',
+                        final=True, errorhandler=self.custom_replace) ==
+                        (u'aaaa'+ FOO + u'bbbb', 9))
 
     def test_unexpected_end_of_data(self):
         """
@@ -343,6 +355,7 @@
             'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
         ]
         FFFD = u'\ufffd'
+        FOO = u'\u1234'
         for seq in sequences:
             seq = self.to_bytestring(seq)
             py.test.raises(UnicodeDecodeError, self.decoder, seq, len(seq),
@@ -358,6 +371,12 @@
                                 ) == (u'', len(seq))
             assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, 'ignore',
                         final=True) == (u'aaaabbbb', len(seq) + 8))
+            assert (self.decoder(seq, len(seq), 'custom', final=True,
+                        errorhandler=self.custom_replace) == 
+                        (FOO * len(seq), len(seq)))
+            assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, 'custom',
+                        final=True, errorhandler=self.custom_replace) ==
+                        (u'aaaa'+ FOO * len(seq) + u'bbbb', len(seq) + 8))
 
     def test_invalid_cb_for_2bytes_seq(self):
         """