[pypy-commit] pypy unicode-utf8: whack sre until it compiles

fijal pypy.commits at gmail.com
Sun Dec 3 16:25:26 EST 2017


Author: fijal
Branch: unicode-utf8
Changeset: r93268:831913dc603e
Date: 2017-12-03 22:14 +0100
http://bitbucket.org/pypy/pypy/changeset/831913dc603e/

Log:	whack sre until it compiles

diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -6,7 +6,7 @@
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
 from pypy.interpreter.error import OperationError, oefmt
 from rpython.rlib.rarithmetic import intmask
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.rutf8 import Utf8StringBuilder
 
@@ -42,7 +42,9 @@
         if isinstance(ctx, rsre_core.StrMatchContext):
             return space.newbytes(ctx._string[start:end])
         elif isinstance(ctx, rsre_core.UnicodeMatchContext):
-            return space.newunicode(ctx._unicodestr[start:end])
+            s = ctx._unicodestr[start:end]
+            lgt, flag = rutf8.check_utf8(s, True)
+            return space.newutf8(s, lgt, flag)
         else:
             # unreachable
             raise SystemError
@@ -110,7 +112,9 @@
         if endpos < pos:
             endpos = pos
         if space.isinstance_w(w_string, space.w_unicode):
-            unicodestr = space.unicode_w(w_string)
+            unicodestr = space.utf8_w(w_string)
+            # XXX will fail some tests: the length needs to be adjusted for
+            #     the real character length, etc.
             if pos > len(unicodestr):
                 pos = len(unicodestr)
             if endpos > len(unicodestr):
@@ -341,7 +345,7 @@
                                      unicodebuilder.get_flag()), n
         else:
             if space.isinstance_w(w_string, space.w_unicode):
-                w_emptystr = space.newunicode(u'')
+                w_emptystr = space.newutf8('', 0, rutf8.FLAG_ASCII)
             else:
                 w_emptystr = space.newbytes('')
             w_item = space.call_method(w_emptystr, 'join',
@@ -575,7 +579,8 @@
         elif isinstance(ctx, rsre_core.StrMatchContext):
             return space.newbytes(ctx._string)
         elif isinstance(ctx, rsre_core.UnicodeMatchContext):
-            return space.newunicode(ctx._unicodestr)
+            lgt, flag = rutf8.check_utf8(ctx._unicodestr, True)
+            return space.newutf8(ctx._unicodestr, lgt, flag)
         else:
             raise SystemError
 
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -311,10 +311,10 @@
         return res, flag
     raise CheckError(~res)
 
-def get_utf8_length_flag(s):
+def get_utf8_length_flag(s, start=0, end=-1):
     """ Get the length and flag out of valid utf8. For now just calls check_utf8
     """
-    return check_utf8(s, True)
+    return check_utf8(s, True, start, end)
 
 @jit.elidable
 def _check_utf8(s, allow_surrogates, start, stop):
@@ -694,6 +694,12 @@
         self._lgt += newlgt
         self._flag = combine_flags(self._flag, newflag)
 
+    def append_slice(self, s, start, end):
+        self._s.append_slice(s, start, end)
+        newlgt, newflag = get_utf8_length_flag(s, start, end)
+        self._lgt += newlgt
+        self._flag = combine_flags(self._flag, newflag)
+
     @signature(char(), returns=none())
     def append_char(self, s):
         # for characters, ascii


More information about the pypy-commit mailing list