[pypy-commit] pypy unicode-utf8-py3: pos in encoding error handler is in unicode not utf8. refactor, use only utf8

Thu Nov 15 04:12:04 EST 2018

Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95318:83e8a364e912
Date: 2018-11-15 01:11 -0800
http://bitbucket.org/pypy/pypy/changeset/83e8a364e912/

Log:	The pos passed to the encoding error handler is a unicode (codepoint)
	index, not a utf8 byte offset. Refactor to use only utf8.

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -241,8 +241,9 @@
         pos = e.pos
         assert pos >= 0
         start = s[:pos]
+        upos = rutf8.codepoints_in_utf8(s, end=pos)
         ru, lgt = errorhandler(errors, 'utf8',
-                    'surrogates not allowed', s, pos, pos + 1)
+                    'surrogates not allowed', s, upos, upos + 1)
         end = utf8_encode_utf_8(s[pos+3:], errors, errorhandler,
                                 allow_surrogates=allow_surrogates)
         s = start + ru + end
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -379,10 +379,11 @@
 def surrogatepass_errors(space, w_exc):
     check_exception(space, w_exc)
     if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
-        obj = space.realunicode_w(space.getattr(w_exc, space.newtext('object')))
+        utf8 = space.utf8_w(space.getattr(w_exc, space.newtext('object')))
         start = space.int_w(space.getattr(w_exc, space.newtext('start')))
         w_end = space.getattr(w_exc, space.newtext('end'))
         encoding = space.text_w(space.getattr(w_exc, space.newtext('encoding')))
+        msg = space.text_w(space.getattr(w_exc, space.newtext('reason')))
         bytelength, code = get_standard_encoding(encoding)
         if code == ENC_UNKNOWN:
             # Not supported, fail with original exception
@@ -390,8 +391,12 @@
         end = space.int_w(w_end)
         builder = StringBuilder()
         pos = start
+        # start, end are in codepoint indices
+        itr = rutf8.Utf8StringIterator(utf8)
+        for i in range(pos):
+            itr.next()
         while pos < end:
-            ch = ord(obj[pos])
+            ch = itr.next()
             pos += 1
             if ch < 0xd800 or ch > 0xdfff:
                 # Not a surrogate, fail with original exception
@@ -466,8 +471,11 @@
         end = space.int_w(w_end)
         res = ''
         pos = start
+        itr = rutf8.Utf8StringIterator(utf8)
+        for i in range(pos):
+            itr.next()
         while pos < end:
-            ch = rutf8.codepoint_at_pos(utf8, pos)
+            ch = itr.next()
             pos += 1
             if ch < 0xdc80 or ch > 0xdcff:
                 # Not a UTF-8b surrogate, fail with original exception
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -648,6 +648,8 @@
         assert u'\ud8ae'.encode('utf_16_be', 'surrogatepass') == b'\xd8\xae'
         assert (u'\U0000d8ae'.encode('utf-32-be', 'surrogatepass') ==
                 b'\x00\x00\xd8\xae')
+        assert (u'\x80\ud800'.encode('utf8', 'surrogatepass') ==
+                b'\xc2\x80\xed\xa0\x80')
 
     def test_badandgoodsurrogatepassexceptions(self):
         import codecs
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1209,28 +1209,26 @@
         if errors is None:
             errors = 'strict'
         pos = rutf8.surrogate_in_utf8(utf8)
-        if pos >= 0:
-            handled_error = True
-        else:
-            handled_error = False
         state = space.fromcache(CodecState)
         eh = state.encode_error_handler
-        while pos >= 0:
-            start = utf8[:pos]
-            ru, _pos = eh(errors, "utf8", "surrogates not allowed", utf8,
-                pos, pos + 1)
-            upos = rutf8.next_codepoint_pos(utf8, _pos)
-            end = utf8[upos:]
-            utf8 = start + ru + end
-            _pos = rutf8.surrogate_in_utf8(utf8)
-            if _pos <= pos:
-                # surrogatepass?
-                break 
-            pos = _pos
-        if errors == 'surrogateescape' and handled_error:
-            #escape
-            return space.newbytes(utf8)
-        w_object = space.newtext(utf8)
+        if pos >= 0:
+            while pos >= 0:
+                start = utf8[:pos]
+                upos = rutf8.codepoints_in_utf8(utf8, end=pos)
+                ru, _pos = eh(errors, "utf8", "surrogates not allowed", utf8,
+                    upos, upos + 1)
+                upos = rutf8.next_codepoint_pos(utf8, _pos)
+                end = utf8[upos:]
+                utf8 = start + ru + end
+                _pos = rutf8.surrogate_in_utf8(utf8)
+                if _pos <= pos:
+                    # surrogatepass?
+                    break 
+                pos = _pos
+            if errors == 'surrogateescape':
+                #escape
+                return space.newbytes(utf8)
+            w_object = space.newtext(utf8)
     if errors is None or errors == 'strict':
         if encoding is None or encoding == 'utf-8':
             #if rutf8.has_surrogates(utf8):