[pypy-commit] pypy default: Be less inefficient when we decode string literals:
amauryfa
noreply at buildbot.pypy.org
Wed Sep 17 22:11:06 CEST 2014
Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: default
Changeset: r73590:10fd806838e2
Date: 2014-09-17 22:01 +0200
http://bitbucket.org/pypy/pypy/changeset/10fd806838e2/
Log: Be less inefficient when we decode string literals: use RPython
unicode instead of the full codec machinery, just to write a \U
escape sequence!
diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -83,12 +83,6 @@
v = PyString_DecodeEscape(space, substr, 'strict', enc)
return space.wrap(v)
-def hexbyte(val):
- result = "%x" % val
- if len(result) == 1:
- result = "0" + result
- return result
-
def decode_unicode_utf8(space, s, ps, q):
# ****The Python 2.7 version, producing UTF-32 escapes****
# String is utf8-encoded, but 'unicode_escape' expects
@@ -108,15 +102,14 @@
# instead.
lis.append("u005c")
if ord(s[ps]) & 0x80: # XXX inefficient
- w, ps = decode_utf8(space, s, ps, end, "utf-32-be")
- rn = len(w)
- assert rn % 4 == 0
- for i in range(0, rn, 4):
- lis.append('\\U')
- lis.append(hexbyte(ord(w[i])))
- lis.append(hexbyte(ord(w[i+1])))
- lis.append(hexbyte(ord(w[i+2])))
- lis.append(hexbyte(ord(w[i+3])))
+ w, ps = decode_utf8(space, s, ps, end)
+ for c in w:
+ # The equivalent of %08x, which is not supported by RPython.
+ # 7 zeroes are enough for the unicode range, and the
+ # result still fits in 32-bit.
+ hexa = hex(ord(c) + 0x10000000)
+ lis.append('\\U0')
+ lis.append(hexa[3:]) # Skip 0x and the leading 1
else:
lis.append(s[ps])
ps += 1
@@ -136,7 +129,7 @@
# note that the C code has a label here.
# the logic is the same.
if recode_encoding and ord(s[ps]) & 0x80:
- w, ps = decode_utf8(space, s, ps, end, recode_encoding)
+ w, ps = decode_utf8_recode(space, s, ps, end, recode_encoding)
# Append bytes to output buffer.
builder.append(w)
else:
@@ -222,15 +215,19 @@
ch >= 'A' and ch <= 'F')
-def decode_utf8(space, s, ps, end, encoding):
+def decode_utf8(space, s, ps, end):
assert ps >= 0
pt = ps
# while (s < end && *s != '\\') s++; */ /* inefficient for u".."
while ps < end and ord(s[ps]) & 0x80:
ps += 1
- w_u = space.wrap(unicodehelper.decode_utf8(space, s[pt:ps]))
- w_v = unicodehelper.encode(space, w_u, encoding)
- v = space.str_w(w_v)
+ u = unicodehelper.decode_utf8(space, s[pt:ps])
+ return u, ps
+
+def decode_utf8_recode(space, s, ps, end, recode_encoding):
+ u, ps = decode_utf8(space, s, ps, end)
+ w_v = unicodehelper.encode(space, space.wrap(u), recode_encoding)
+ v = space.bytes_w(w_v)
return v, ps
def raise_app_valueerror(space, msg):
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -5,6 +5,7 @@
@specialize.memo()
def decode_error_handler(space):
+ # Fast version of the "strict" errors handler.
def raise_unicode_exception_decode(errors, encoding, msg, s,
startingpos, endingpos):
raise OperationError(space.w_UnicodeDecodeError,
@@ -17,6 +18,7 @@
@specialize.memo()
def encode_error_handler(space):
+ # Fast version of the "strict" errors handler.
def raise_unicode_exception_encode(errors, encoding, msg, u,
startingpos, endingpos):
raise OperationError(space.w_UnicodeEncodeError,
More information about the pypy-commit mailing list