[pypy-commit] pypy rpython-utf8: add the possibility of doing x.encode('utf-8') in rpython

Thu Aug 30 16:24:57 CEST 2012

Author: Antonio Cuni <anto.cuni at gmail.com>
Branch: rpython-utf8
Changeset: r56941:c3807d8cd57a
Date: 2012-08-30 16:06 +0200
http://bitbucket.org/pypy/pypy/changeset/c3807d8cd57a/

Log:	add the possibility of doing x.encode('utf-8') in rpython

diff --git a/pypy/annotation/unaryop.py b/pypy/annotation/unaryop.py
--- a/pypy/annotation/unaryop.py
+++ b/pypy/annotation/unaryop.py
@@ -530,7 +530,7 @@
         if not s_enc.is_constant():
             raise TypeError("Non-constant encoding not supported")
         enc = s_enc.const
-        if enc not in ('ascii', 'latin-1'):
+        if enc not in ('ascii', 'latin-1', 'utf-8'):
             raise TypeError("Encoding %s not supported for unicode" % (enc,))
         return SomeString()
     method_encode.can_only_throw = [UnicodeEncodeError]
diff --git a/pypy/rpython/lltypesystem/rstr.py b/pypy/rpython/lltypesystem/rstr.py
--- a/pypy/rpython/lltypesystem/rstr.py
+++ b/pypy/rpython/lltypesystem/rstr.py
@@ -187,6 +187,14 @@
             result.chars[i] = cast_primitive(Char, c)
         return result
 
+    @jit.elidable
+    def ll_encode_utf8(self, ll_s):
+        from pypy.rpython.annlowlevel import hlunicode, llstr
+        from pypy.rlib.runicode import unicode_encode_utf_8
+        s = hlunicode(ll_s)
+        bytes = unicode_encode_utf_8(s, len(s), 'strict')
+        return llstr(bytes)
+
 class CharRepr(AbstractCharRepr, StringRepr):
     lowleveltype = Char
 
diff --git a/pypy/rpython/ootypesystem/rstr.py b/pypy/rpython/ootypesystem/rstr.py
--- a/pypy/rpython/ootypesystem/rstr.py
+++ b/pypy/rpython/ootypesystem/rstr.py
@@ -98,6 +98,13 @@
             sb.ll_append_char(cast_primitive(Char, c))
         return sb.ll_build()
 
+    def ll_encode_utf8(self, ll_s):
+        from pypy.rpython.annlowlevel import hlunicode, oostr
+        from pypy.rlib.runicode import unicode_encode_utf_8
+        s = hlunicode(ll_s)
+        bytes = unicode_encode_utf_8(s, len(s), 'strict')
+        return oostr(bytes)
+
 class CharRepr(AbstractCharRepr, StringRepr):
     lowleveltype = Char
 
diff --git a/pypy/rpython/rstr.py b/pypy/rpython/rstr.py
--- a/pypy/rpython/rstr.py
+++ b/pypy/rpython/rstr.py
@@ -340,6 +340,8 @@
             return hop.gendirectcall(self.ll_str, v_self)
         elif encoding == "latin-1":
             return hop.gendirectcall(self.ll_encode_latin1, v_self)
+        elif encoding == 'utf-8':
+            return hop.gendirectcall(self.ll_encode_utf8, v_self)
         else:
             raise TyperError("encoding %s not implemented" % (encoding, ))
 
diff --git a/pypy/rpython/test/test_runicode.py b/pypy/rpython/test/test_runicode.py
--- a/pypy/rpython/test/test_runicode.py
+++ b/pypy/rpython/test/test_runicode.py
@@ -100,7 +100,7 @@
     def test_unicode_encode(self):
         def f(x):
             y = u'xxx'
-            return (y + unichr(x)).encode('ascii') + y.encode('latin-1')
+            return (y + unichr(x)).encode('ascii') + y.encode('latin-1') + y.encode('utf-8')
 
         assert self.ll_to_string(self.interpret(f, [38])) == f(38)