[pypy-commit] pypy unicode-utf8: (fijal, arigo)
arigo
pypy.commits at gmail.com
Thu Aug 24 11:22:45 EDT 2017
Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r92255:8e029544ca2f
Date: 2017-08-24 17:21 +0200
http://bitbucket.org/pypy/pypy/changeset/8e029544ca2f/
Log: (fijal, arigo)
Fix unicode.translate()
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -365,36 +365,39 @@
return mod_format(space, w_values, self, do_unicode=True)
def descr_translate(self, space, w_table):
- selfvalue = self._utf8.decode("utf8")
- w_sys = space.getbuiltinmodule('sys')
- maxunicode = space.int_w(space.getattr(w_sys,
- space.newtext("maxunicode")))
- result = []
- for unichar in selfvalue:
+ input = self._utf8
+ result = StringBuilder(len(input))
+ result_length = 0
+ i = 0
+ while i < len(input):
+ codepoint = rutf8.codepoint_at_pos(input, i)
+ i = rutf8.next_codepoint_pos(input, i)
try:
- w_newval = space.getitem(w_table, space.newint(ord(unichar)))
+ w_newval = space.getitem(w_table, space.newint(codepoint))
except OperationError as e:
- if e.match(space, space.w_LookupError):
- result.append(unichar)
- else:
+ if not e.match(space, space.w_LookupError):
raise
else:
if space.is_w(w_newval, space.w_None):
continue
elif space.isinstance_w(w_newval, space.w_int):
- newval = space.int_w(w_newval)
- if newval < 0 or newval > maxunicode:
- raise oefmt(space.w_TypeError,
- "character mapping must be in range(%s)",
- hex(maxunicode + 1))
- result.append(unichr(newval))
- elif space.isinstance_w(w_newval, space.w_unicode):
- result.append(space.utf8_w(w_newval).decode("utf8"))
+ codepoint = space.int_w(w_newval)
+ elif isinstance(w_newval, W_UnicodeObject):
+ result.append(w_newval._utf8)
+ result_length += w_newval._length
+ continue
else:
raise oefmt(space.w_TypeError,
"character mapping must return integer, None "
"or unicode")
- return W_UnicodeObject(u''.join(result).encode("utf8"), -1)
+ try:
+ rutf8.unichr_as_utf8_append(result, codepoint,
+ allow_surrogates=True)
+ result_length += 1
+ except ValueError:
+ raise oefmt(space.w_TypeError,
+ "character mapping must be in range(0x110000)")
+ return W_UnicodeObject(result.build(), result_length)
def descr_encode(self, space, w_encoding=None, w_errors=None):
encoding, errors = _get_encoding_and_errors(space, w_encoding,
More information about the pypy-commit mailing list