[pypy-commit] pypy unicode-utf8: finish whacking until the objspace tests pass

Sat Nov 4 18:16:55 EDT 2017

Author: fijal
Branch: unicode-utf8
Changeset: r92935:47de95da2bbb
Date: 2017-11-04 15:26 +0100
http://bitbucket.org/pypy/pypy/changeset/47de95da2bbb/

Log:	finish whacking until the objspace tests pass

diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -8,3 +8,4 @@
 * better flag handling in split/splitlines maybe?
 * find all the fast-paths that we want to do with utf8 (we only do
   utf-8 now, not UTF8 or utf8) for decode/encode
+* encode_error_handler has XXX
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -20,11 +20,13 @@
 @specialize.memo()
 def encode_error_handler(space):
     # Fast version of the "strict" errors handler.
-    def raise_unicode_exception_encode(errors, encoding, msg, w_u,
+    def raise_unicode_exception_encode(errors, encoding, msg, u, u_len,
                                        startingpos, endingpos):
+        # XXX fix once we stop using runicode.py
+        flag = _get_flag(u.decode('utf8'))
         raise OperationError(space.w_UnicodeEncodeError,
                              space.newtuple([space.newtext(encoding),
-                                             w_u,
+                                             space.newutf8(u, u_len, flag),
                                              space.newint(startingpos),
                                              space.newint(endingpos),
                                              space.newtext(msg)]))
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -164,7 +164,7 @@
         if isinstance(x, str):
             return self.newtext(x)
         if isinstance(x, unicode):
-            return self.newutf8(x.encode('utf8'), len(x))
+            return self.newutf8(x.encode('utf8'), len(x), rutf8.FLAG_REGULAR)
         if isinstance(x, float):
             return W_FloatObject(x)
         if isinstance(x, W_Root):
diff --git a/pypy/objspace/std/test/test_index.py b/pypy/objspace/std/test/test_index.py
--- a/pypy/objspace/std/test/test_index.py
+++ b/pypy/objspace/std/test/test_index.py
@@ -1,5 +1,7 @@
 from py.test import raises
 
+from rpython.rlib import rutf8
+
 class AppTest_IndexProtocol:
     def setup_class(self):
         w_oldstyle = self.space.appexec([], """():
@@ -263,7 +265,8 @@
 class AppTest_UnicodeTestCase(SeqTestCase, StringTestCase):
     def setup_method(self, method):
         SeqTestCase.setup_method(self, method)
-        self.w_seq = self.space.wrap(u"this is a test")
+        self.w_seq = self.space.newutf8("this is a test", len("this is a test"),
+                                        rutf8.FLAG_ASCII)
         self.w_const = self.space.appexec([], """(): return unicode""")
 
 
diff --git a/pypy/objspace/std/test/test_lengthhint.py b/pypy/objspace/std/test/test_lengthhint.py
--- a/pypy/objspace/std/test/test_lengthhint.py
+++ b/pypy/objspace/std/test/test_lengthhint.py
@@ -1,3 +1,6 @@
+
+from rpython.rlib import rutf8
+
 from pypy.module._collections.interp_deque import W_Deque
 from pypy.module.itertools.interp_itertools import W_Repeat
 
@@ -71,7 +74,8 @@
         self._test_length_hint(self.space.wrap('P' * self.SIZE))
 
     def test_unicode(self):
-        self._test_length_hint(self.space.wrap(u'Y' * self.SIZE))
+        self._test_length_hint(self.space.newutf8('Y' * self.SIZE, self.SIZE,
+                                                  rutf8.FLAG_ASCII))
 
     def test_tuple(self):
         self._test_length_hint(self.space.wrap(tuple(self.ITEMS)))
diff --git a/pypy/objspace/std/test/test_liststrategies.py b/pypy/objspace/std/test/test_liststrategies.py
--- a/pypy/objspace/std/test/test_liststrategies.py
+++ b/pypy/objspace/std/test/test_liststrategies.py
@@ -22,7 +22,7 @@
                           BytesListStrategy)
         #assert isinstance(W_ListObject(space, [w(u'a'), w(u'b')]).strategy,
         #                  UnicodeListStrategy)
-        assert isinstance(W_ListObject(space, [w(u'a'), wb('b')]).strategy,
+        assert isinstance(W_ListObject(space, [space.newutf8('a', 1, 0), wb('b')]).strategy,
                           ObjectListStrategy) # mixed unicode and bytes
 
     def test_empty_to_any(self):
diff --git a/pypy/objspace/std/test/test_obj.py b/pypy/objspace/std/test/test_obj.py
--- a/pypy/objspace/std/test/test_obj.py
+++ b/pypy/objspace/std/test/test_obj.py
@@ -17,7 +17,7 @@
         cls.w_cpython_apptest = space.wrap(option.runappdirect and not hasattr(sys, 'pypy_translation_info'))
 
         def w_unwrap_wrap_unicode(space, w_obj):
-            return space.newutf8(space.utf8_w(w_obj), w_obj._length)
+            return space.newutf8(space.utf8_w(w_obj), w_obj._length, w_obj._get_flag())
         cls.w_unwrap_wrap_unicode = space.wrap(gateway.interp2app(w_unwrap_wrap_unicode))
         def w_unwrap_wrap_str(space, w_obj):
             return space.wrap(space.str_w(w_obj))
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1834,7 +1834,7 @@
     if not isinstance(w_unistr, W_UnicodeObject):
         raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
     unistr = w_unistr._utf8
-    result = ['\0'] * len(unistr)
+    result = ['\0'] * w_unistr._length
     digits = ['0', '1', '2', '3', '4',
               '5', '6', '7', '8', '9']
     i = 0
@@ -1843,6 +1843,8 @@
         uchr = rutf8.codepoint_at_pos(unistr, i)
         if rutf8.isspace(unistr, i):
             result[res_pos] = ' '
+            res_pos += 1
+            i = rutf8.next_codepoint_pos(unistr, i)
             continue
         try:
             result[res_pos] = digits[unicodedb.decimal(uchr)]