[pypy-commit] pypy unicode-utf8: in progress
fijal
pypy.commits at gmail.com
Mon Nov 20 05:53:44 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r93089:e4a80363506c
Date: 2017-11-20 11:52 +0100
http://bitbucket.org/pypy/pypy/changeset/e4a80363506c/
Log: in progress
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,30 +1,35 @@
+from hypothesis import given, strategies
+
+from rpython.rlib import rutf8
+
from pypy.interpreter.unicodehelper import str_decode_utf8
from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
+from pypy.interpreter import unicodehelper as uh
def decode_utf8(u):
return str_decode_utf8(u, True, "strict", None)
def test_decode_utf8():
- assert decode_utf8("abc") == ("abc", 3)
- assert decode_utf8("\xe1\x88\xb4") == ("\xe1\x88\xb4", 1)
- assert decode_utf8("\xed\xa0\x80") == ("\xed\xa0\x80", 1)
- assert decode_utf8("\xed\xb0\x80") == ("\xed\xb0\x80", 1)
+ assert decode_utf8("abc") == ("abc", 3, 3, rutf8.FLAG_ASCII)
+ assert decode_utf8("\xe1\x88\xb4") == ("\xe1\x88\xb4", 3, 1, rutf8.FLAG_REGULAR)
+ assert decode_utf8("\xed\xa0\x80") == ("\xed\xa0\x80", 3, 1, rutf8.FLAG_HAS_SURROGATES)
+ assert decode_utf8("\xed\xb0\x80") == ("\xed\xb0\x80", 3, 1, rutf8.FLAG_HAS_SURROGATES)
assert decode_utf8("\xed\xa0\x80\xed\xb0\x80") == (
- "\xed\xa0\x80\xed\xb0\x80", 2)
- assert decode_utf8("\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 1)
+ "\xed\xa0\x80\xed\xb0\x80", 6, 2, rutf8.FLAG_HAS_SURROGATES)
+ assert decode_utf8("\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 4, 1, rutf8.FLAG_REGULAR)
def test_utf8_encode_ascii():
- assert utf8_encode_ascii("abc", 3, "??", "??") == "abc"
+ assert utf8_encode_ascii("abc", "??", "??") == "abc"
def eh(errors, encoding, reason, p, start, end):
lst.append((errors, encoding, p, start, end))
return "<FOO>", end
lst = []
input = u"\u1234".encode("utf8")
- assert utf8_encode_ascii(input, 1, "??", eh) == "<FOO>"
+ assert utf8_encode_ascii(input, "??", eh) == "<FOO>"
assert lst == [("??", "ascii", input, 0, 1)]
lst = []
input = u"\u1234\u5678abc\u8765\u4321".encode("utf8")
- assert utf8_encode_ascii(input, 7, "??", eh) == "<FOO>abc<FOO>"
+ assert utf8_encode_ascii(input, "??", eh) == "<FOO>abc<FOO>"
assert lst == [("??", "ascii", input, 0, 2),
("??", "ascii", input, 5, 7)]
@@ -46,3 +51,7 @@
("??", "ascii", input, 1, 2),
("??", "ascii", input, 5, 6),
("??", "ascii", input, 6, 7)]
+
+@given(strategies.binary())
+def test_unicode_raw_escape(s):
+ uh.utf8_encode_raw_unicode_escape(s, 'strict')
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -158,7 +158,7 @@
res.append(chr(oc))
i += 1
else:
- r, pos = errorhandler(errors, 'latin1',
+ r, pos = errorhandler(errors, 'latin1',
'ordinal not in range(256)', s, cur,
cur + 1)
res.append(r)
@@ -189,7 +189,7 @@
res.append(r)
else:
res.append(chr(ch))
- i = rutf8.next_codepoint_pos(utf8, i)
+ i = rutf8.next_codepoint_pos(utf8, i)
pos += 1
s = res.build()
@@ -318,7 +318,7 @@
assert pos - continuation_bytes >= 0
r = res.build()
lgt, flag = rutf8.check_utf8(r, True)
- return r, pos - continuation_bytes, lgt, flag
+ return r, pos, lgt, flag
hexdigits = "0123456789ABCDEFabcdef"
@@ -362,7 +362,7 @@
flag = rutf8.FLAG_REGULAR
pos += digits
size = 1
-
+
return pos, size, flag
def str_decode_unicode_escape(s, errors, final, errorhandler, ud_handler):
More information about the pypy-commit
mailing list