[pypy-commit] pypy unicode-utf8-py3: refactor unicode_to_decimal to use only utf8

Sat Feb 9 11:35:44 EST 2019

Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95920:902af9e81bcc
Date: 2019-02-09 14:10 +0100
http://bitbucket.org/pypy/pypy/changeset/902af9e81bcc/

Log:	refactor unicode_to_decimal to use only utf8

diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -1,24 +1,22 @@
 * find a better way to run "find" without creating the index storage, if one
   is not already readily available (understand cost now, improve after merge)
 * write the correct jit_elidable in _get_index_storage (Armin)
-* improve performance of splitlines
+* improve performance of splitlines (CF)
 * stop using runicode/unicode and move MAXUNICODE to rutf8 (Matti)
-* think about cost of utf8 list strategy (Armin and CF)
+* think about cost of utf8 list strategy (CF)
 * revisit why runicode import str_decode_utf_8_impl needed instead of runicode
   import str_decode_utf_8
 * revisit all places where we do utf8.decode('utf-8'), they should work
   directly with utf8 (can be converted via runicode.str_decode_utf_8 as well)
   - rutf8.utf8_encode_mbcs
   - unicodehelper.fsencode
-  - unicodehelper.unicode_to_decimal_w
   - _winreg.interp_winreg
 * remove 'assert not isinstance(*, unicode)
 * add a flag that prevents support for unicode in rpython and enable it in PyPy (CF, Armin)
 * remove asserts from _WIN32 paths in rlib.rposix.re{name,place}
 * convert all realunicode_w to unicode_w after we flush out all old uses of
   unicode_w
-* benchmark more (looks good so far)
-* Review all uses of W_Unicode.text_w, right now it is exactly W_Unicode.utf8_w. 
+* view all uses of W_Unicode.text_w, right now it is exactly W_Unicode.utf8_w. 
   It should only return valid utf8 (see 0be26dc39a59 which broke translation on
   win32 and failed tests on linux64). Then we can use it in places like
   _socket.interp_func.getaddrinfo instead of space.encode_unicode_object(w_port,
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1901,26 +1901,29 @@
 def unicode_to_decimal_w(space, w_unistr, allow_surrogates=False):
     if not isinstance(w_unistr, W_UnicodeObject):
         raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
-    value = _rpy_unicode_to_decimal_w(space, w_unistr.utf8_w(space).decode('utf8'))
-    # XXX this is the only place in the code that this funcion is called.
-    return unicodehelper.encode_utf8(space, value,
-                                     allow_surrogates=allow_surrogates)
-
-def _rpy_unicode_to_decimal_w(space, unistr):
-    # XXX rewrite this to accept a utf8 string and use a StringBuilder
-    result = [u'\0'] * len(unistr)
-    for i in xrange(len(unistr)):
-        uchr = ord(unistr[i])
+    utf8 = space.utf8_w(w_unistr)
+    lgt =  space.len_w(w_unistr) 
+    result = StringBuilder(lgt)
+    itr = rutf8.Utf8StringIterator(utf8)
+    for uchr in itr:
         if uchr > 127:
             if unicodedb.isspace(uchr):
-                result[i] = ' '
+                result.append(' ')
                 continue
             try:
                 uchr = ord(u'0') + unicodedb.decimal(uchr)
             except KeyError:
-                pass
-        result[i] = unichr(uchr)
-    return u''.join(result)
+                w_encoding = space.newtext('decimal')
+                pos = itr.get_pos()
+                w_start = space.newint(pos)
+                w_end = space.newint(pos+1)
+                w_reason = space.newtext('invalid decimal Unicode string')
+                raise OperationError(space.w_UnicodeEncodeError,
+                                     space.newtuple([w_encoding, w_unistr,
+                                                     w_start, w_end,
+                                                     w_reason]))
+        result.append(chr(uchr))
+    return result.build()
 
 @jit.elidable
 def g_encode_utf8(value):