[pypy-commit] pypy unicode-utf8: Tweak the unicode FLAG_xx values for performance; collapse two identical helpers; move combine_flags() to rutf8
arigo
pypy.commits at gmail.com
Thu Nov 23 04:27:49 EST 2017
Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r93133:a1cf21d7a124
Date: 2017-11-23 10:24 +0100
http://bitbucket.org/pypy/pypy/changeset/a1cf21d7a124/
Log: Tweak the unicode FLAG_xx values for performance; collapse two
identical helpers; move combine_flags() to rutf8
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -3,6 +3,7 @@
from pypy.interpreter.error import OperationError
from rpython.rlib.objectmodel import specialize
from rpython.rlib import rutf8
+from rpython.rlib.rutf8 import combine_flags
from rpython.rlib.rarithmetic import r_uint, intmask
from rpython.rlib.rstring import StringBuilder
from pypy.module._codecs import interp_codecs
@@ -43,14 +44,6 @@
from pypy.objspace.std.unicodeobject import encode_object
return encode_object(space, w_data, encoding, errors)
-def combine_flags(one, two):
- if one == rutf8.FLAG_ASCII and two == rutf8.FLAG_ASCII:
- return rutf8.FLAG_ASCII
- elif (one == rutf8.FLAG_HAS_SURROGATES or
- two == rutf8.FLAG_HAS_SURROGATES):
- return rutf8.FLAG_HAS_SURROGATES
- return rutf8.FLAG_REGULAR
-
def _has_surrogate(u):
for c in u:
@@ -788,7 +781,8 @@
# first surrogate
surrogate = outCh
else:
- flag = combine_flags(flag, rutf8.unichr_to_flag(outCh))
+ flag = combine_flags(flag,
+ rutf8.get_flag_from_code(outCh))
outsize += 1
assert outCh >= 0
rutf8.unichr_as_utf8_append(result, outCh, True)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -356,7 +356,7 @@
elif unicodedb.islower(ch):
ch = unicodedb.toupper(ch)
if ch >= 0x80:
- flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
+ flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
rutf8.unichr_as_utf8_append(builder, ch)
return W_UnicodeObject(builder.build(), self._length, flag)
@@ -381,7 +381,7 @@
else:
ch = unicodedb.tolower(ch)
if ch >= 0x80:
- flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
+ flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
rutf8.unichr_as_utf8_append(builder, ch)
previous_is_cased = unicodedb.iscased(ch)
return builder.build(), flag
@@ -407,7 +407,7 @@
codepoint = space.int_w(w_newval)
elif isinstance(w_newval, W_UnicodeObject):
result.append(w_newval._utf8)
- flag = unicodehelper.combine_flags(flag, w_newval._get_flag())
+ flag = rutf8.combine_flags(flag, w_newval._get_flag())
result_length += w_newval._length
continue
else:
@@ -416,7 +416,7 @@
"or unicode")
try:
if codepoint >= 0x80:
- flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
+ flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
rutf8.unichr_as_utf8_append(result, codepoint,
allow_surrogates=True)
result_length += 1
@@ -540,7 +540,7 @@
while pos < len(self._utf8):
lower = unicodedb.tolower(rutf8.codepoint_at_pos(self._utf8, pos))
if lower >= 0x80:
- flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
+ flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
rutf8.unichr_as_utf8_append(builder, lower) # XXX allow surrogates?
pos = rutf8.next_codepoint_pos(self._utf8, pos)
return W_UnicodeObject(builder.build(), self._len(), flag)
@@ -642,7 +642,7 @@
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
raise
- flag = unicodehelper.combine_flags(self._get_flag(), w_other._get_flag())
+ flag = rutf8.combine_flags(self._get_flag(), w_other._get_flag())
return W_UnicodeObject(self._utf8 + w_other._utf8,
self._len() + w_other._len(), flag)
@@ -667,7 +667,7 @@
# XXX Maybe the extra copy here is okay? It was basically going to
# happen anyway, what with being placed into the builder
w_u = self.convert_arg_to_w_unicode(space, w_s)
- flag = unicodehelper.combine_flags(flag, w_u._get_flag())
+ flag = rutf8.combine_flags(flag, w_u._get_flag())
unwrapped.append(w_u._utf8)
lgt += w_u._length
prealloc_size += len(unwrapped[i])
@@ -719,7 +719,7 @@
uchar = rutf8.codepoint_at_pos(value, i)
uchar = unicodedb.toupper(uchar)
if uchar >= 0x80:
- flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
+ flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
i = rutf8.next_codepoint_pos(value, i)
rutf8.unichr_as_utf8_append(builder, uchar)
return W_UnicodeObject(builder.build(), self._length, flag)
@@ -833,14 +833,14 @@
ch = unicodedb.toupper(uchar)
rutf8.unichr_as_utf8_append(builder, ch)
if ch >= 0x80:
- flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
+ flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
while i < len(value):
uchar = rutf8.codepoint_at_pos(value, i)
i = rutf8.next_codepoint_pos(value, i)
ch = unicodedb.tolower(uchar)
rutf8.unichr_as_utf8_append(builder, ch)
if ch >= 0x80:
- flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
+ flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
return W_UnicodeObject(builder.build(), self._len(), flag)
@unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
@@ -926,7 +926,7 @@
except OverflowError:
raise oefmt(space.w_OverflowError, "replace string is too long")
- flag = unicodehelper.combine_flags(self._get_flag(), w_by._get_flag())
+ flag = rutf8.combine_flags(self._get_flag(), w_by._get_flag())
newlength = self._length + replacements * (w_by._length - w_sub._length)
return W_UnicodeObject(res, newlength, flag)
@@ -1048,7 +1048,7 @@
if w_fillchar._len() != 1:
raise oefmt(space.w_TypeError,
"rjust() argument 2 must be a single character")
- flag = unicodehelper.combine_flags(self._get_flag(), w_fillchar._get_flag())
+ flag = rutf8.combine_flags(self._get_flag(), w_fillchar._get_flag())
d = width - lgt
if d > 0:
if len(w_fillchar._utf8) == 1:
@@ -1067,7 +1067,7 @@
if w_fillchar._len() != 1:
raise oefmt(space.w_TypeError,
"ljust() argument 2 must be a single character")
- flag = unicodehelper.combine_flags(self._get_flag(), w_fillchar._get_flag())
+ flag = rutf8.combine_flags(self._get_flag(), w_fillchar._get_flag())
d = width - self._len()
if d > 0:
if len(w_fillchar._utf8) == 1:
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -50,6 +50,7 @@
def unichr_as_utf8_append(builder, code, allow_surrogates=False):
"""Encode code (numeric value) as utf8 encoded string
and emit the result into the given StringBuilder.
+ Raises ValueError if the code is outside range(0x110000).
"""
code = r_uint(code)
if code <= r_uint(0x7F):
@@ -124,13 +125,6 @@
continuation_bytes += 1
return len(s) - continuation_bytes
-def get_flag_from_code(oc):
- if oc <= 0x7F:
- return FLAG_ASCII
- if 0xD800 <= oc <= 0xDFFF:
- return FLAG_HAS_SURROGATES
- return FLAG_REGULAR
-
def codepoint_at_pos(code, pos):
""" Give a codepoint in code at pos - assumes valid utf8, no checking!
"""
@@ -453,22 +447,24 @@
UTF8_INDEX_STORAGE = lltype.GcStruct('utf8_loc',
('flag', lltype.Signed),
- ('contents', lltype.Ptr(lltype.GcArray(lltype.Struct(
- 'utf8_loc_elem',
- ('baseindex', lltype.Signed),
- ('ofs', lltype.FixedSizeArray(lltype.Char, 16)))
- ))))
+ ('contents', lltype.Ptr(lltype.GcArray(lltype.Struct('utf8_loc_elem',
+ ('baseindex', lltype.Signed),
+ ('ofs', lltype.FixedSizeArray(lltype.Char, 16)),
+ )))))
-def unichr_to_flag(ch):
- if ch <= 0x7F:
+def get_flag_from_code(oc):
+ if oc <= 0x7F:
return FLAG_ASCII
- elif 0xD800 <= ch <= 0xDFFF:
+ if 0xD800 <= oc <= 0xDFFF:
return FLAG_HAS_SURROGATES
return FLAG_REGULAR
-FLAG_REGULAR = 0
-FLAG_HAS_SURROGATES = 1
-FLAG_ASCII = 2
+def combine_flags(one, two):
+ return one | two
+
+FLAG_ASCII = 0 # no bits
+FLAG_REGULAR = 1 # bit 0
+FLAG_HAS_SURROGATES = 3 # bit 0 and bit 1
# note that we never need index storage if we're pure ascii, but it's useful
# for passing into W_UnicodeObject.__init__
More information about the pypy-commit
mailing list