[pypy-commit] pypy default: Be less inefficient when we decode string literals:

Wed Sep 17 22:11:06 CEST 2014

Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: 
Changeset: r73590:10fd806838e2
Date: 2014-09-17 22:01 +0200
http://bitbucket.org/pypy/pypy/changeset/10fd806838e2/

Log:	Be less inefficient when we decode string literals: use RPython
	unicode instead of the full codec machinery, just to write a \U
	escape sequence!

diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -83,12 +83,6 @@
     v = PyString_DecodeEscape(space, substr, 'strict', enc)
     return space.wrap(v)
 
-def hexbyte(val):
-    result = "%x" % val
-    if len(result) == 1:
-        result = "0" + result
-    return result
-
 def decode_unicode_utf8(space, s, ps, q):
     # ****The Python 2.7 version, producing UTF-32 escapes****
     # String is utf8-encoded, but 'unicode_escape' expects
@@ -108,15 +102,14 @@
                 # instead.
                 lis.append("u005c")
         if ord(s[ps]) & 0x80: # XXX inefficient
-            w, ps = decode_utf8(space, s, ps, end, "utf-32-be")
-            rn = len(w)
-            assert rn % 4 == 0
-            for i in range(0, rn, 4):
-                lis.append('\\U')
-                lis.append(hexbyte(ord(w[i])))
-                lis.append(hexbyte(ord(w[i+1])))
-                lis.append(hexbyte(ord(w[i+2])))
-                lis.append(hexbyte(ord(w[i+3])))
+            w, ps = decode_utf8(space, s, ps, end)
+            for c in w:
+                # The equivalent of %08x, which is not supported by RPython.
+                # 7 zeroes are enough for the unicode range, and the
+                # result still fits in 32-bit.
+                hexa = hex(ord(c) + 0x10000000)
+                lis.append('\\U0')
+                lis.append(hexa[3:])  # Skip 0x and the leading 1
         else:
             lis.append(s[ps])
             ps += 1
@@ -136,7 +129,7 @@
             # note that the C code has a label here.
             # the logic is the same.
             if recode_encoding and ord(s[ps]) & 0x80:
-                w, ps = decode_utf8(space, s, ps, end, recode_encoding)
+                w, ps = decode_utf8_recode(space, s, ps, end, recode_encoding)
                 # Append bytes to output buffer.
                 builder.append(w)
             else:
@@ -222,15 +215,19 @@
             ch >= 'A' and ch <= 'F')
 
 
-def decode_utf8(space, s, ps, end, encoding):
+def decode_utf8(space, s, ps, end):
     assert ps >= 0
     pt = ps
     # while (s < end && *s != '\\') s++; */ /* inefficient for u".."
     while ps < end and ord(s[ps]) & 0x80:
         ps += 1
-    w_u = space.wrap(unicodehelper.decode_utf8(space, s[pt:ps]))
-    w_v = unicodehelper.encode(space, w_u, encoding)
-    v = space.str_w(w_v)
+    u = unicodehelper.decode_utf8(space, s[pt:ps])
+    return u, ps
+
+def decode_utf8_recode(space, s, ps, end, recode_encoding):
+    u, ps = decode_utf8(space, s, ps, end)
+    w_v = unicodehelper.encode(space, space.wrap(u), recode_encoding)
+    v = space.bytes_w(w_v)
     return v, ps
 
 def raise_app_valueerror(space, msg):
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -5,6 +5,7 @@
 
 @specialize.memo()
 def decode_error_handler(space):
+    # Fast version of the "strict" errors handler.
     def raise_unicode_exception_decode(errors, encoding, msg, s,
                                        startingpos, endingpos):
         raise OperationError(space.w_UnicodeDecodeError,
@@ -17,6 +18,7 @@
 
 @specialize.memo()
 def encode_error_handler(space):
+    # Fast version of the "strict" errors handler.
     def raise_unicode_exception_encode(errors, encoding, msg, u,
                                        startingpos, endingpos):
         raise OperationError(space.w_UnicodeEncodeError,