[pypy-commit] pypy unicode-utf8: Handle errorhandlers that go backward

Sun Dec 10 00:17:32 EST 2017

Author: Ronan Lamy <ronan.lamy at gmail.com>
Branch: unicode-utf8
Changeset: r93342:a4d68881a89d
Date: 2017-12-10 05:16 +0000
http://bitbucket.org/pypy/pypy/changeset/a4d68881a89d/

Log:	Handle errorhandlers that go backward

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1101,19 +1101,16 @@
             ru, newindex = errorhandler(errors, public_encoding_name,
                                    'surrogates not allowed',
                                     s, pos-1, pos)
-            for j in range(newindex - index):
-                pos = rutf8.next_codepoint_pos(s, pos)
-            j = 0
-            while j < len(ru):
-                ch = rutf8.codepoint_at_pos(ru, j)
-                if ord(ch) < 0xD800:
-                    _STORECHAR(result, ord(ch), byteorder)
+            for cp in rutf8.Utf8StringIterator(res_8):
+                if cp < 0xD800:
+                    _STORECHAR(result, cp, byteorder)
                 else:
                     errorhandler('strict', public_encoding_name,
                                  'surrogates not allowed',
                                  s, pos-1, pos)
-                j = rutf8.next_codepoint_pos(ru, j)
-            index = newindex
+            if index != newindex:  # Should be uncommon
+                index = newindex
+                pos = rutf8._pos_at_index(s, newindex)
             continue
 
         pos = rutf8.next_codepoint_pos(s, pos)
@@ -1282,22 +1279,19 @@
         ch = rutf8.codepoint_at_pos(s, pos)
         pos = rutf8.next_codepoint_pos(s, pos)
         if not allow_surrogates and 0xD800 <= ch < 0xE000:
-            ru, newindex = errorhandler(errors, public_encoding_name,
+            res_8, newindex = errorhandler(errors, public_encoding_name,
                                         'surrogates not allowed',
                                         s, pos-1, pos)
-            for j in range(newindex - index):
-                pos = rutf8.next_codepoint_pos(s, pos)
-            j = 0
-            while j < len(ru):
-                ch = rutf8.codepoint_at_pos(ru, j)
-                if ord(ch) < 0xD800:
-                    _STORECHAR32(result, ord(ch), byteorder)
+            for ch in rutf8.Utf8StringIterator(res_8):
+                if ch < 0xD800:
+                    _STORECHAR32(result, ch, byteorder)
                 else:
                     errorhandler('strict', public_encoding_name,
                                  'surrogates not allowed',
                                  s, pos-1, pos)
-                j = rutf8.next_codepoint_pos(ru, j)
-            index = newindex
+            if index != newindex:  # Should be uncommon
+                index = newindex
+                pos = rutf8._pos_at_index(s, newindex)
             continue
         _STORECHAR32(result, ch, byteorder)
         index += 1
@@ -1459,9 +1453,7 @@
                 result.append(ch2)
             if index != newindex:  # Should be uncommon
                 index = newindex
-                pos = 0
-                for _ in range(newindex):
-                    pos = rutf8.next_codepoint_pos(s, pos)
+                pos = rutf8._pos_at_index(s, newindex)
             continue
         result.append(c)
         index += 1
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -439,7 +439,7 @@
             low = codepoint_at_pos(utf8, i)
             if 0xDC00 <= low <= 0xDFFF:
                 uchr = 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
-                i = next_codepoint_pos(utf8, i)                
+                i = next_codepoint_pos(utf8, i)
             # else not really a surrogate pair, just append high
         else:
             i = next_codepoint_pos(utf8, i)
@@ -537,6 +537,13 @@
     else:
         return next_codepoint_pos(utf8, next_codepoint_pos(utf8, bytepos))
 
+def _pos_at_index(utf8, index):
+    # Slow!  Steps through utf8 one codepoint at a time, starting from byte 0.
+    pos = 0
+    for _ in range(index):
+        pos = next_codepoint_pos(utf8, pos)
+    return pos
+
 @jit.dont_look_inside
 def codepoint_at_index(utf8, storage, index):
     """ Return codepoint of a character inside utf8 encoded string, given