[pypy-commit] pypy default: merge the rpython-utf8 branch: it is now possible to use the codec 'utf-8' when doing .encode()/.decode() in rpython

Thu Aug 30 17:16:07 CEST 2012

Author: Antonio Cuni <anto.cuni at gmail.com>
Branch: 
Changeset: r56947:25e17cd05cdc
Date: 2012-08-30 17:15 +0200
http://bitbucket.org/pypy/pypy/changeset/25e17cd05cdc/

Log:	merge the rpython-utf8 branch: it is now possible to use the codec
	'utf-8' when doing .encode()/.decode() in rpython

diff --git a/pypy/annotation/unaryop.py b/pypy/annotation/unaryop.py
--- a/pypy/annotation/unaryop.py
+++ b/pypy/annotation/unaryop.py
@@ -530,7 +530,7 @@
         if not s_enc.is_constant():
             raise TypeError("Non-constant encoding not supported")
         enc = s_enc.const
-        if enc not in ('ascii', 'latin-1'):
+        if enc not in ('ascii', 'latin-1', 'utf-8'):
             raise TypeError("Encoding %s not supported for unicode" % (enc,))
         return SomeString()
     method_encode.can_only_throw = [UnicodeEncodeError]
@@ -553,7 +553,7 @@
         if not s_enc.is_constant():
             raise TypeError("Non-constant encoding not supported")
         enc = s_enc.const
-        if enc not in ('ascii', 'latin-1'):
+        if enc not in ('ascii', 'latin-1', 'utf-8'):
             raise TypeError("Encoding %s not supported for strings" % (enc,))
         return SomeUnicodeString()
     method_decode.can_only_throw = [UnicodeDecodeError]
diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -47,12 +47,10 @@
 
 def raise_unicode_exception_decode(errors, encoding, msg, s,
                                    startingpos, endingpos):
-    assert isinstance(s, str)
     raise UnicodeDecodeError(encoding, s, startingpos, endingpos, msg)
 
 def raise_unicode_exception_encode(errors, encoding, msg, u,
                                    startingpos, endingpos):
-    assert isinstance(u, unicode)
     raise UnicodeEncodeError(encoding, u, startingpos, endingpos, msg)
 
 # ____________________________________________________________
diff --git a/pypy/rpython/lltypesystem/rstr.py b/pypy/rpython/lltypesystem/rstr.py
--- a/pypy/rpython/lltypesystem/rstr.py
+++ b/pypy/rpython/lltypesystem/rstr.py
@@ -143,6 +143,13 @@
             s.chars[i] = cast_primitive(UniChar, value.chars[i])
         return s
 
+    def ll_decode_utf8(self, llvalue):
+        from pypy.rpython.annlowlevel import hlstr, llunicode
+        from pypy.rlib.runicode import str_decode_utf_8
+        value = hlstr(llvalue)
+        univalue, _ = str_decode_utf_8(value, len(value), 'strict')
+        return llunicode(univalue)
+
 class UnicodeRepr(BaseLLStringRepr, AbstractUnicodeRepr):
     lowleveltype = Ptr(UNICODE)
     basetype = basestring
@@ -187,6 +194,14 @@
             result.chars[i] = cast_primitive(Char, c)
         return result
 
+    @jit.elidable
+    def ll_encode_utf8(self, ll_s):
+        from pypy.rpython.annlowlevel import hlunicode, llstr
+        from pypy.rlib.runicode import unicode_encode_utf_8
+        s = hlunicode(ll_s)
+        bytes = unicode_encode_utf_8(s, len(s), 'strict')
+        return llstr(bytes)
+
 class CharRepr(AbstractCharRepr, StringRepr):
     lowleveltype = Char
 
diff --git a/pypy/rpython/ootypesystem/rstr.py b/pypy/rpython/ootypesystem/rstr.py
--- a/pypy/rpython/ootypesystem/rstr.py
+++ b/pypy/rpython/ootypesystem/rstr.py
@@ -60,6 +60,13 @@
             sb.ll_append_char(cast_primitive(UniChar, c))
         return sb.ll_build()
 
+    def ll_decode_utf8(self, llvalue):
+        from pypy.rpython.annlowlevel import hlstr, oounicode
+        from pypy.rlib.runicode import str_decode_utf_8
+        value = hlstr(llvalue)
+        univalue, _ = str_decode_utf_8(value, len(value), 'strict')
+        return oounicode(univalue)
+
 
 class UnicodeRepr(BaseOOStringRepr, AbstractUnicodeRepr):
     lowleveltype = ootype.Unicode
@@ -98,6 +105,13 @@
             sb.ll_append_char(cast_primitive(Char, c))
         return sb.ll_build()
 
+    def ll_encode_utf8(self, ll_s):
+        from pypy.rpython.annlowlevel import hlunicode, oostr
+        from pypy.rlib.runicode import unicode_encode_utf_8
+        s = hlunicode(ll_s)
+        bytes = unicode_encode_utf_8(s, len(s), 'strict')
+        return oostr(bytes)
+
 class CharRepr(AbstractCharRepr, StringRepr):
     lowleveltype = Char
 
diff --git a/pypy/rpython/rstr.py b/pypy/rpython/rstr.py
--- a/pypy/rpython/rstr.py
+++ b/pypy/rpython/rstr.py
@@ -309,6 +309,8 @@
             return hop.gendirectcall(self.ll.ll_str2unicode, v_self)
         elif encoding == 'latin-1':
             return hop.gendirectcall(self.ll_decode_latin1, v_self)
+        elif encoding == 'utf-8':
+            return hop.gendirectcall(self.ll_decode_utf8, v_self)
         else:
             raise TyperError("encoding %s not implemented" % (encoding, ))
 
@@ -340,6 +342,8 @@
             return hop.gendirectcall(self.ll_str, v_self)
         elif encoding == "latin-1":
             return hop.gendirectcall(self.ll_encode_latin1, v_self)
+        elif encoding == 'utf-8':
+            return hop.gendirectcall(self.ll_encode_utf8, v_self)
         else:
             raise TyperError("encoding %s not implemented" % (encoding, ))
 
diff --git a/pypy/rpython/test/test_runicode.py b/pypy/rpython/test/test_runicode.py
--- a/pypy/rpython/test/test_runicode.py
+++ b/pypy/rpython/test/test_runicode.py
@@ -98,9 +98,11 @@
         self.interpret_raises(UnicodeEncodeError, f, [1234])
 
     def test_unicode_encode(self):
-        def f(x):
-            y = u'xxx'
-            return (y + unichr(x)).encode('ascii') + y.encode('latin-1')
+        def f(n):
+            x = u'xxx' + unichr(n)
+            y = u'àèì' + unichr(n)
+            z = u'美' + unichr(n)
+            return x.encode('ascii') + y.encode('latin-1') + z.encode('utf-8')
 
         assert self.ll_to_string(self.interpret(f, [38])) == f(38)
 
@@ -128,11 +130,14 @@
         assert self.interpret(f, [300, False]) == f(300, False)
 
     def test_unicode_decode(self):
-        def f(x):
-            y = 'xxx'
-            return (y + chr(x)).decode('ascii') + chr(x).decode("latin-1") 
+        strings = ['xxx', u'àèì'.encode('latin-1'), u'美'.encode('utf-8')]
+        def f(n):
+            x = strings[n]
+            y = strings[n+1]
+            z = strings[n+2]
+            return x.decode('ascii') + y.decode('latin-1') + z.decode('utf-8')
 
-        assert self.ll_to_string(self.interpret(f, [38])) == f(38)
+        assert self.ll_to_string(self.interpret(f, [0])) == f(0)
 
     def test_unicode_decode_error(self):
         def f(x):