[pypy-commit] pypy unicode-utf8: remove decode from repr function
fijal
pypy.commits at gmail.com
Thu Oct 26 09:47:52 EDT 2017
Author: fijal
Branch: unicode-utf8
Changeset: r92851:196f5e9026d4
Date: 2017-10-26 15:45 +0200
http://bitbucket.org/pypy/pypy/changeset/196f5e9026d4/
Log: remove decode from repr function
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -126,10 +126,6 @@
return True
@staticmethod
- def _op_val(space, w_other, strict=None):
- return W_UnicodeObject.convert_arg_to_w_unicode(space, w_other, strict)._utf8.decode('utf8')
-
- @staticmethod
def convert_arg_to_w_unicode(space, w_other, strict=None):
if isinstance(w_other, W_UnicodeObject):
return w_other
@@ -237,10 +233,7 @@
return w_newobj
def descr_repr(self, space):
- chars = self._utf8.decode('utf8')
- size = len(chars)
- s = _repr_function(chars, size, "strict")
- return space.newtext(s)
+ return space.newtext(_repr_function(self._utf8))
def descr_str(self, space):
return encode_object(space, self, None, None)
@@ -1752,5 +1745,5 @@
return ''.join(result)
-_repr_function, _ = make_unicode_escape_function(
- pass_printable=False, unicode_output=False, quotes=True, prefix='u')
+_repr_function = rutf8.make_utf8_escape_function(
+ pass_printable=False, quotes=True, prefix='u')
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -16,10 +16,11 @@
"""
import sys
-from rpython.rlib.objectmodel import enforceargs
+from rpython.rlib.objectmodel import enforceargs, we_are_translated
from rpython.rlib.rstring import StringBuilder
from rpython.rlib import jit
from rpython.rlib.rarithmetic import r_uint, intmask
+from rpython.rlib.unicodedata import unicodedb
from rpython.rtyper.lltypesystem import lltype, rffi
@@ -478,3 +479,100 @@
if index == 2:
bytepos = next_codepoint_pos(utf8, bytepos)
return codepoint_at_pos(utf8, bytepos)
+
+def make_utf8_escape_function(pass_printable=False, quotes=False, prefix=None):
+ @jit.elidable
+ def unicode_escape(s):
+ size = len(s)
+ result = StringBuilder(size)
+
+ if quotes:
+ if prefix:
+ result.append(prefix)
+ if s.find('\'') != -1 and s.find('\"') == -1:
+ quote = ord('\"')
+ result.append('"')
+ else:
+ quote = ord('\'')
+ result.append('\'')
+ else:
+ quote = 0
+
+ if size == 0:
+ return ''
+
+ pos = 0
+ while pos < size:
+ oc = codepoint_at_pos(s, pos)
+ ch = s[pos]
+
+ # Escape quotes
+ if quotes and (oc == quote or ch == '\\'):
+ result.append('\\')
+ next_pos = next_codepoint_pos(s, pos)
+ result.append_slice(s, pos, next_pos)
+ pos = next_pos
+ continue
+
+ # The following logic is enabled only if MAXUNICODE == 0xffff, or
+ # for testing on top of a host Python where sys.maxunicode == 0xffff
+ if (not we_are_translated() and sys.maxunicode == 0xFFFF and
+ 0xD800 <= oc < 0xDC00 and pos + 3 < size):
+ # Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes
+ pos += 3
+ oc2 = codepoint_at_pos(s, pos)
+
+ if 0xDC00 <= oc2 <= 0xDFFF:
+ ucs = (((oc & 0x03FF) << 10) | (oc2 & 0x03FF)) + 0x00010000
+ char_escape_helper(result, ucs)
+ pos += 3
+ continue
+ # Fall through: isolated surrogates are copied as-is
+ pos -= 3
+
+ # Map special whitespace to '\t', \n', '\r'
+ if ch == '\t':
+ result.append('\\t')
+ elif ch == '\n':
+ result.append('\\n')
+ elif ch == '\r':
+ result.append('\\r')
+ elif ch == '\\':
+ result.append('\\\\')
+
+ # Map non-printable or non-ascii to '\xhh' or '\uhhhh'
+ elif pass_printable and not (oc <= 0x10ffff and unicodedb.isprintable(oc)):
+ char_escape_helper(result, oc)
+ elif not pass_printable and (oc < 32 or oc >= 0x7F):
+ char_escape_helper(result, oc)
+
+ # Copy everything else as-is
+ else:
+ if oc < 128:
+ result.append(ch)
+ else:
+ next_pos = next_codepoint_pos(s, pos)
+ result.append_slice(s, pos, next_pos)
+ pos = next_codepoint_pos(s, pos)
+
+ if quotes:
+ result.append(chr(quote))
+ return result.build()
+
+ TABLE = '0123456789abcdef'
+
+ def char_escape_helper(result, char):
+ if char >= 0x10000 or char < 0:
+ result.append("\\U")
+ zeros = 8
+ elif char >= 0x100:
+ result.append("\\u")
+ zeros = 4
+ else:
+ result.append("\\x")
+ zeros = 2
+ for i in range(zeros-1, -1, -1):
+ result.append(TABLE[(char >> (4 * i)) & 0x0f])
+
+ return unicode_escape #, char_escape_helper
+
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -114,6 +114,13 @@
assert (rutf8.codepoint_position_at_index(u.encode('utf8'), index, i) ==
len(u[:i].encode('utf8')))
+repr_func = rutf8.make_utf8_escape_function(prefix='u', pass_printable=False,
+ quotes=True)
+
+ at given(strategies.text())
+def test_repr(u):
+ assert repr(u) == repr_func(u.encode('utf8'))
+
@given(strategies.lists(strategies.characters()))
def test_surrogate_in_utf8(unichars):
uni = u''.join(unichars).encode('utf-8')
More information about the pypy-commit
mailing list