[pypy-commit] pypy unicode-utf8: in progress

Mon Nov 20 05:53:44 EST 2017

Author: fijal
Branch: unicode-utf8
Changeset: r93089:e4a80363506c
Date: 2017-11-20 11:52 +0100
http://bitbucket.org/pypy/pypy/changeset/e4a80363506c/

Log:	in progress

diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,30 +1,35 @@
+from hypothesis import given, strategies
+
+from rpython.rlib import rutf8
+
 from pypy.interpreter.unicodehelper import str_decode_utf8
 from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
+from pypy.interpreter import unicodehelper as uh
 
 def decode_utf8(u):
     return str_decode_utf8(u, True, "strict", None)
 
 def test_decode_utf8():
-    assert decode_utf8("abc") == ("abc", 3)
-    assert decode_utf8("\xe1\x88\xb4") == ("\xe1\x88\xb4", 1)
-    assert decode_utf8("\xed\xa0\x80") == ("\xed\xa0\x80", 1)
-    assert decode_utf8("\xed\xb0\x80") == ("\xed\xb0\x80", 1)
+    assert decode_utf8("abc") == ("abc", 3, 3, rutf8.FLAG_ASCII)
+    assert decode_utf8("\xe1\x88\xb4") == ("\xe1\x88\xb4", 3, 1, rutf8.FLAG_REGULAR)
+    assert decode_utf8("\xed\xa0\x80") == ("\xed\xa0\x80", 3, 1, rutf8.FLAG_HAS_SURROGATES)
+    assert decode_utf8("\xed\xb0\x80") == ("\xed\xb0\x80", 3, 1, rutf8.FLAG_HAS_SURROGATES)
     assert decode_utf8("\xed\xa0\x80\xed\xb0\x80") == (
-        "\xed\xa0\x80\xed\xb0\x80", 2)
-    assert decode_utf8("\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 1)
+        "\xed\xa0\x80\xed\xb0\x80", 6, 2, rutf8.FLAG_HAS_SURROGATES)
+    assert decode_utf8("\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 4, 1, rutf8.FLAG_REGULAR)
 
 def test_utf8_encode_ascii():
-    assert utf8_encode_ascii("abc", 3, "??", "??") == "abc"
+    assert utf8_encode_ascii("abc", "??", "??") == "abc"
     def eh(errors, encoding, reason, p, start, end):
         lst.append((errors, encoding, p, start, end))
         return "<FOO>", end
     lst = []
     input = u"\u1234".encode("utf8")
-    assert utf8_encode_ascii(input, 1, "??", eh) == "<FOO>"
+    assert utf8_encode_ascii(input, "??", eh) == "<FOO>"
     assert lst == [("??", "ascii", input, 0, 1)]
     lst = []
     input = u"\u1234\u5678abc\u8765\u4321".encode("utf8")
-    assert utf8_encode_ascii(input, 7, "??", eh) == "<FOO>abc<FOO>"
+    assert utf8_encode_ascii(input, "??", eh) == "<FOO>abc<FOO>"
     assert lst == [("??", "ascii", input, 0, 2),
                    ("??", "ascii", input, 5, 7)]
 
@@ -46,3 +51,7 @@
                    ("??", "ascii", input, 1, 2),
                    ("??", "ascii", input, 5, 6),
                    ("??", "ascii", input, 6, 7)]
+
+@given(strategies.binary())
+def test_unicode_raw_escape(s):
+    uh.utf8_encode_raw_unicode_escape(s, 'strict')
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -158,7 +158,7 @@
                 res.append(chr(oc))
                 i += 1
             else:
-                r, pos = errorhandler(errors, 'latin1', 
+                r, pos = errorhandler(errors, 'latin1',
                                       'ordinal not in range(256)', s, cur,
                                       cur + 1)
                 res.append(r)
@@ -189,7 +189,7 @@
             res.append(r)
         else:
             res.append(chr(ch))
-            i = rutf8.next_codepoint_pos(utf8, i)    
+            i = rutf8.next_codepoint_pos(utf8, i)
             pos += 1
 
     s = res.build()
@@ -318,7 +318,7 @@
     assert pos - continuation_bytes >= 0
     r = res.build()
     lgt, flag = rutf8.check_utf8(r, True)
-    return r, pos - continuation_bytes, lgt, flag
+    return r, pos, lgt, flag
 
 hexdigits = "0123456789ABCDEFabcdef"
 
@@ -362,7 +362,7 @@
                     flag = rutf8.FLAG_REGULAR
                 pos += digits
                 size = 1
-                
+
     return pos, size, flag
 
 def str_decode_unicode_escape(s, errors, final, errorhandler, ud_handler):