[pypy-commit] pypy unicode-utf8: start working on pypyjson

Fri Nov 24 04:04:41 EST 2017

Author: fijal
Branch: unicode-utf8
Changeset: r93155:109fd5f5d4eb
Date: 2017-11-23 20:52 +0100
http://bitbucket.org/pypy/pypy/changeset/109fd5f5d4eb/

Log:	start working on pypyjson

diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1760,10 +1760,6 @@
     def utf8_w(self, w_obj):
         return w_obj.utf8_w(self)
 
-    def unicode_w(self, w_obj):
-        # XXX: kill me!
-        return w_obj.utf8_w(self).decode('utf-8')
-
     def convert_to_w_unicode(self, w_obj):
         return w_obj.convert_to_w_unicode(self)
 
diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -1,7 +1,7 @@
 import sys
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.objectmodel import specialize, always_inline, r_dict
-from rpython.rlib import rfloat, runicode
+from rpython.rlib import rfloat, runicode, rutf8
 from rpython.rtyper.lltypesystem import lltype, rffi
 from pypy.interpreter.error import oefmt
 from pypy.interpreter import unicodehelper
@@ -19,29 +19,6 @@
         return 0.0
     return x * NEG_POW_10[exp]
 
-def strslice2unicode_latin1(s, start, end):
-    """
-    Convert s[start:end] to unicode. s is supposed to be an RPython string
-    encoded in latin-1, which means that the numeric value of each char is the
-    same as the corresponding unicode code point.
-
-    Internally it's implemented at the level of low-level helpers, to avoid
-    the extra copy we would need if we take the actual slice first.
-
-    No bound checking is done, use carefully.
-    """
-    from rpython.rtyper.annlowlevel import llstr, hlunicode
-    from rpython.rtyper.lltypesystem.rstr import malloc, UNICODE
-    from rpython.rtyper.lltypesystem.lltype import cast_primitive, UniChar
-    length = end-start
-    ll_s = llstr(s)
-    ll_res = malloc(UNICODE, length)
-    ll_res.hash = 0
-    for i in range(length):
-        ch = ll_s.chars[start+i]
-        ll_res.chars[i] = cast_primitive(UniChar, ch)
-    return hlunicode(ll_res)
-
 def slice_eq(a, b):
     (ll_chars1, start1, length1, _) = a
     (ll_chars2, start2, length2, _) = b
@@ -312,8 +289,7 @@
             bits |= ord(ch)
             if ch == '"':
                 self.pos = i
-                return self.space.newunicode(
-                        self._create_string(start, i - 1, bits))
+                return self._create_string(start, i - 1, bits)
             elif ch == '\\' or ch < '\x20':
                 self.pos = i-1
                 return self.decode_string_escaped(start)
@@ -322,12 +298,15 @@
         if bits & 0x80:
             # the 8th bit is set, it's an utf8 string
             content_utf8 = self.getslice(start, end)
-            return unicodehelper.decode_utf8(self.space, content_utf8)
+            lgt, flag = unicodehelper.check_utf8_or_raise(self.space,
+                                                          content_utf8)
+            return self.space.newutf8(content_utf8, lgt, flag)
         else:
             # ascii only, fast path (ascii is a strict subset of
             # latin1, and we already checked that all the chars are <
             # 128)
-            return strslice2unicode_latin1(self.s, start, end)
+            return self.space.newutf8(self.getslice(start, end),
+                                      end - start, rutf8.FLAG_ASCII)
 
     def decode_string_escaped(self, start):
         i = self.pos
@@ -340,9 +319,10 @@
             i += 1
             if ch == '"':
                 content_utf8 = builder.build()
-                content_unicode = unicodehelper.decode_utf8(self.space, content_utf8)
+                lgt, f = unicodehelper.check_utf8_or_raise(self.space,
+                                                           content_utf8)
                 self.pos = i
-                return self.space.newunicode(content_unicode)
+                return self.space.newutf8(content_utf8, lgt, f)
             elif ch == '\\':
                 i = self.decode_escape_sequence(i, builder)
             elif ch < '\x20':
diff --git a/pypy/module/_pypyjson/test/test__pypyjson.py b/pypy/module/_pypyjson/test/test__pypyjson.py
--- a/pypy/module/_pypyjson/test/test__pypyjson.py
+++ b/pypy/module/_pypyjson/test/test__pypyjson.py
@@ -10,10 +10,14 @@
     assert dec.skip_whitespace(8) == len(s)
     dec.close()
 
+class FakeSpace(object):
+    def newutf8(self, s, l, f):
+        return s
+
 def test_decode_key():
     s1 = "123" * 100
     s = ' "%s"   "%s" ' % (s1, s1)
-    dec = JSONDecoder('fake space', s)
+    dec = JSONDecoder(FakeSpace(), s)
     assert dec.pos == 0
     x = dec.decode_key(0)
     assert x == s1
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -367,23 +367,10 @@
         assert isinstance(utf8s, str)
         return W_UnicodeObject(utf8s, length, flag)
 
-    def new_from_utf8(self, utf8s):
-        # XXX: kill me!
-        assert isinstance(utf8s, str)
-        length, flag = rutf8.check_utf8(utf8s, True)
-        return W_UnicodeObject(utf8s, length, flag)
-
     def newfilename(self, s):
         assert isinstance(s, str) # on pypy3, this decodes the byte string
         return W_BytesObject(s)   # with the filesystem encoding
 
-    def newunicode(self, unistr):
-        # XXX: kill me!
-        assert isinstance(unistr, unicode)
-        utf8s = unistr.encode("utf-8")
-        length, flag = rutf8.check_utf8(utf8s, True)
-        return self.newutf8(utf8s, length, flag)
-
     def type(self, w_obj):
         jit.promote(w_obj.__class__)
         return w_obj.getclass(self)