[pypy-commit] pypy unicode-utf8: Tweak the unicode FLAG_xx values for performance; collapse two identical helpers; move combine_flags() to rutf8

Thu Nov 23 04:27:49 EST 2017

Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r93133:a1cf21d7a124
Date: 2017-11-23 10:24 +0100
http://bitbucket.org/pypy/pypy/changeset/a1cf21d7a124/

Log:	Tweak the unicode FLAG_xx values for performance; collapse two
	identical helpers; move combine_flags() to rutf8

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -3,6 +3,7 @@
 from pypy.interpreter.error import OperationError
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib import rutf8
+from rpython.rlib.rutf8 import combine_flags
 from rpython.rlib.rarithmetic import r_uint, intmask
 from rpython.rlib.rstring import StringBuilder
 from pypy.module._codecs import interp_codecs
@@ -43,14 +44,6 @@
     from pypy.objspace.std.unicodeobject import encode_object
     return encode_object(space, w_data, encoding, errors)
 
-def combine_flags(one, two):
-    if one == rutf8.FLAG_ASCII and two == rutf8.FLAG_ASCII:
-        return rutf8.FLAG_ASCII
-    elif (one == rutf8.FLAG_HAS_SURROGATES or
-          two == rutf8.FLAG_HAS_SURROGATES):
-        return rutf8.FLAG_HAS_SURROGATES
-    return rutf8.FLAG_REGULAR
-
 
 def _has_surrogate(u):
     for c in u:
@@ -788,7 +781,8 @@
                         # first surrogate
                         surrogate = outCh
                     else:
-                        flag = combine_flags(flag, rutf8.unichr_to_flag(outCh))
+                        flag = combine_flags(flag,
+                                             rutf8.get_flag_from_code(outCh))
                         outsize += 1
                         assert outCh >= 0
                         rutf8.unichr_as_utf8_append(result, outCh, True)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -356,7 +356,7 @@
             elif unicodedb.islower(ch):
                 ch = unicodedb.toupper(ch)
             if ch >= 0x80:
-                flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
+                flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
             rutf8.unichr_as_utf8_append(builder, ch)
         return W_UnicodeObject(builder.build(), self._length, flag)
 
@@ -381,7 +381,7 @@
             else:
                 ch = unicodedb.tolower(ch)
             if ch >= 0x80:
-                flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
+                flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
             rutf8.unichr_as_utf8_append(builder, ch)
             previous_is_cased = unicodedb.iscased(ch)
         return builder.build(), flag
@@ -407,7 +407,7 @@
                     codepoint = space.int_w(w_newval)
                 elif isinstance(w_newval, W_UnicodeObject):
                     result.append(w_newval._utf8)
-                    flag = unicodehelper.combine_flags(flag, w_newval._get_flag())
+                    flag = rutf8.combine_flags(flag, w_newval._get_flag())
                     result_length += w_newval._length
                     continue
                 else:
@@ -416,7 +416,7 @@
                                 "or unicode")
             try:
                 if codepoint >= 0x80:
-                    flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
+                    flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
                 rutf8.unichr_as_utf8_append(result, codepoint,
                                             allow_surrogates=True)
                 result_length += 1
@@ -540,7 +540,7 @@
         while pos < len(self._utf8):
             lower = unicodedb.tolower(rutf8.codepoint_at_pos(self._utf8, pos))
             if lower >= 0x80:
-                flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
+                flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
             rutf8.unichr_as_utf8_append(builder, lower) # XXX allow surrogates?
             pos = rutf8.next_codepoint_pos(self._utf8, pos)
         return W_UnicodeObject(builder.build(), self._len(), flag)
@@ -642,7 +642,7 @@
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
             raise
-        flag = unicodehelper.combine_flags(self._get_flag(), w_other._get_flag())
+        flag = rutf8.combine_flags(self._get_flag(), w_other._get_flag())
         return W_UnicodeObject(self._utf8 + w_other._utf8,
                                self._len() + w_other._len(), flag)
 
@@ -667,7 +667,7 @@
             # XXX Maybe the extra copy here is okay? It was basically going to
             #     happen anyway, what with being placed into the builder
             w_u = self.convert_arg_to_w_unicode(space, w_s)
-            flag = unicodehelper.combine_flags(flag, w_u._get_flag())
+            flag = rutf8.combine_flags(flag, w_u._get_flag())
             unwrapped.append(w_u._utf8)
             lgt += w_u._length
             prealloc_size += len(unwrapped[i])
@@ -719,7 +719,7 @@
             uchar = rutf8.codepoint_at_pos(value, i)
             uchar = unicodedb.toupper(uchar)
             if uchar >= 0x80:
-                flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
+                flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
             i = rutf8.next_codepoint_pos(value, i)
             rutf8.unichr_as_utf8_append(builder, uchar)
         return W_UnicodeObject(builder.build(), self._length, flag)
@@ -833,14 +833,14 @@
         ch = unicodedb.toupper(uchar)
         rutf8.unichr_as_utf8_append(builder, ch)
         if ch >= 0x80:
-            flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
+            flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
         while i < len(value):
             uchar = rutf8.codepoint_at_pos(value, i)
             i = rutf8.next_codepoint_pos(value, i)
             ch = unicodedb.tolower(uchar)
             rutf8.unichr_as_utf8_append(builder, ch)
             if ch >= 0x80:
-                flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
+                flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
         return W_UnicodeObject(builder.build(), self._len(), flag)
 
     @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
@@ -926,7 +926,7 @@
         except OverflowError:
             raise oefmt(space.w_OverflowError, "replace string is too long")
 
-        flag = unicodehelper.combine_flags(self._get_flag(), w_by._get_flag())
+        flag = rutf8.combine_flags(self._get_flag(), w_by._get_flag())
         newlength = self._length + replacements * (w_by._length - w_sub._length)
         return W_UnicodeObject(res, newlength, flag)
 
@@ -1048,7 +1048,7 @@
         if w_fillchar._len() != 1:
             raise oefmt(space.w_TypeError,
                         "rjust() argument 2 must be a single character")
-        flag = unicodehelper.combine_flags(self._get_flag(), w_fillchar._get_flag())
+        flag = rutf8.combine_flags(self._get_flag(), w_fillchar._get_flag())
         d = width - lgt
         if d > 0:
             if len(w_fillchar._utf8) == 1:
@@ -1067,7 +1067,7 @@
         if w_fillchar._len() != 1:
             raise oefmt(space.w_TypeError,
                         "ljust() argument 2 must be a single character")
-        flag = unicodehelper.combine_flags(self._get_flag(), w_fillchar._get_flag())
+        flag = rutf8.combine_flags(self._get_flag(), w_fillchar._get_flag())
         d = width - self._len()
         if d > 0:
             if len(w_fillchar._utf8) == 1:
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -50,6 +50,7 @@
 def unichr_as_utf8_append(builder, code, allow_surrogates=False):
     """Encode code (numeric value) as utf8 encoded string
     and emit the result into the given StringBuilder.
+    Raises ValueError if the code is outside range(0x110000).
     """
     code = r_uint(code)
     if code <= r_uint(0x7F):
@@ -124,13 +125,6 @@
             continuation_bytes += 1
     return len(s) - continuation_bytes
 
-def get_flag_from_code(oc):
-    if oc <= 0x7F:
-        return FLAG_ASCII
-    if 0xD800 <= oc <= 0xDFFF:
-        return FLAG_HAS_SURROGATES
-    return FLAG_REGULAR
-
 def codepoint_at_pos(code, pos):
     """ Give a codepoint in code at pos - assumes valid utf8, no checking!
     """
@@ -453,22 +447,24 @@
 
 UTF8_INDEX_STORAGE = lltype.GcStruct('utf8_loc',
     ('flag', lltype.Signed),
-    ('contents', lltype.Ptr(lltype.GcArray(lltype.Struct(
-    'utf8_loc_elem',
-    ('baseindex', lltype.Signed),
-    ('ofs', lltype.FixedSizeArray(lltype.Char, 16)))
-    ))))
+    ('contents', lltype.Ptr(lltype.GcArray(lltype.Struct('utf8_loc_elem',
+        ('baseindex', lltype.Signed),
+        ('ofs', lltype.FixedSizeArray(lltype.Char, 16)),
+    )))))
 
-def unichr_to_flag(ch):
-    if ch <= 0x7F:
+def get_flag_from_code(oc):
+    if oc <= 0x7F:
         return FLAG_ASCII
-    elif 0xD800 <= ch <= 0xDFFF:
+    if 0xD800 <= oc <= 0xDFFF:
         return FLAG_HAS_SURROGATES
     return FLAG_REGULAR
 
-FLAG_REGULAR = 0
-FLAG_HAS_SURROGATES = 1
-FLAG_ASCII = 2
+def combine_flags(one, two):
+    return one | two
+
+FLAG_ASCII          = 0     # no bits
+FLAG_REGULAR        = 1     # bit 0
+FLAG_HAS_SURROGATES = 3     # bit 0 and bit 1
 # note that we never need index storage if we're pure ascii, but it's useful
 # for passing into W_UnicodeObject.__init__