[pypy-commit] pypy default: merge the rpython-utf8 branch: it is now possible to use the codec 'utf-8' when doing .encode()/.decode() in rpython
antocuni
noreply at buildbot.pypy.org
Thu Aug 30 17:16:07 CEST 2012
Author: Antonio Cuni <anto.cuni at gmail.com>
Branch:
Changeset: r56947:25e17cd05cdc
Date: 2012-08-30 17:15 +0200
http://bitbucket.org/pypy/pypy/changeset/25e17cd05cdc/
Log: Merge the rpython-utf8 branch: the 'utf-8' codec can now be used
with .encode()/.decode() in RPython.
diff --git a/pypy/annotation/unaryop.py b/pypy/annotation/unaryop.py
--- a/pypy/annotation/unaryop.py
+++ b/pypy/annotation/unaryop.py
@@ -530,7 +530,7 @@
if not s_enc.is_constant():
raise TypeError("Non-constant encoding not supported")
enc = s_enc.const
- if enc not in ('ascii', 'latin-1'):
+ if enc not in ('ascii', 'latin-1', 'utf-8'):
raise TypeError("Encoding %s not supported for unicode" % (enc,))
return SomeString()
method_encode.can_only_throw = [UnicodeEncodeError]
@@ -553,7 +553,7 @@
if not s_enc.is_constant():
raise TypeError("Non-constant encoding not supported")
enc = s_enc.const
- if enc not in ('ascii', 'latin-1'):
+ if enc not in ('ascii', 'latin-1', 'utf-8'):
raise TypeError("Encoding %s not supported for strings" % (enc,))
return SomeUnicodeString()
method_decode.can_only_throw = [UnicodeDecodeError]
diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -47,12 +47,10 @@
def raise_unicode_exception_decode(errors, encoding, msg, s,
startingpos, endingpos):
- assert isinstance(s, str)
raise UnicodeDecodeError(encoding, s, startingpos, endingpos, msg)
def raise_unicode_exception_encode(errors, encoding, msg, u,
startingpos, endingpos):
- assert isinstance(u, unicode)
raise UnicodeEncodeError(encoding, u, startingpos, endingpos, msg)
# ____________________________________________________________
diff --git a/pypy/rpython/lltypesystem/rstr.py b/pypy/rpython/lltypesystem/rstr.py
--- a/pypy/rpython/lltypesystem/rstr.py
+++ b/pypy/rpython/lltypesystem/rstr.py
@@ -143,6 +143,13 @@
s.chars[i] = cast_primitive(UniChar, value.chars[i])
return s
+ def ll_decode_utf8(self, llvalue):
+ from pypy.rpython.annlowlevel import hlstr, llunicode
+ from pypy.rlib.runicode import str_decode_utf_8
+ value = hlstr(llvalue)
+ univalue, _ = str_decode_utf_8(value, len(value), 'strict')
+ return llunicode(univalue)
+
class UnicodeRepr(BaseLLStringRepr, AbstractUnicodeRepr):
lowleveltype = Ptr(UNICODE)
basetype = basestring
@@ -187,6 +194,14 @@
result.chars[i] = cast_primitive(Char, c)
return result
+ @jit.elidable
+ def ll_encode_utf8(self, ll_s):
+ from pypy.rpython.annlowlevel import hlunicode, llstr
+ from pypy.rlib.runicode import unicode_encode_utf_8
+ s = hlunicode(ll_s)
+ bytes = unicode_encode_utf_8(s, len(s), 'strict')
+ return llstr(bytes)
+
class CharRepr(AbstractCharRepr, StringRepr):
lowleveltype = Char
diff --git a/pypy/rpython/ootypesystem/rstr.py b/pypy/rpython/ootypesystem/rstr.py
--- a/pypy/rpython/ootypesystem/rstr.py
+++ b/pypy/rpython/ootypesystem/rstr.py
@@ -60,6 +60,13 @@
sb.ll_append_char(cast_primitive(UniChar, c))
return sb.ll_build()
+ def ll_decode_utf8(self, llvalue):
+ from pypy.rpython.annlowlevel import hlstr, oounicode
+ from pypy.rlib.runicode import str_decode_utf_8
+ value = hlstr(llvalue)
+ univalue, _ = str_decode_utf_8(value, len(value), 'strict')
+ return oounicode(univalue)
+
class UnicodeRepr(BaseOOStringRepr, AbstractUnicodeRepr):
lowleveltype = ootype.Unicode
@@ -98,6 +105,13 @@
sb.ll_append_char(cast_primitive(Char, c))
return sb.ll_build()
+ def ll_encode_utf8(self, ll_s):
+ from pypy.rpython.annlowlevel import hlunicode, oostr
+ from pypy.rlib.runicode import unicode_encode_utf_8
+ s = hlunicode(ll_s)
+ bytes = unicode_encode_utf_8(s, len(s), 'strict')
+ return oostr(bytes)
+
class CharRepr(AbstractCharRepr, StringRepr):
lowleveltype = Char
diff --git a/pypy/rpython/rstr.py b/pypy/rpython/rstr.py
--- a/pypy/rpython/rstr.py
+++ b/pypy/rpython/rstr.py
@@ -309,6 +309,8 @@
return hop.gendirectcall(self.ll.ll_str2unicode, v_self)
elif encoding == 'latin-1':
return hop.gendirectcall(self.ll_decode_latin1, v_self)
+ elif encoding == 'utf-8':
+ return hop.gendirectcall(self.ll_decode_utf8, v_self)
else:
raise TyperError("encoding %s not implemented" % (encoding, ))
@@ -340,6 +342,8 @@
return hop.gendirectcall(self.ll_str, v_self)
elif encoding == "latin-1":
return hop.gendirectcall(self.ll_encode_latin1, v_self)
+ elif encoding == 'utf-8':
+ return hop.gendirectcall(self.ll_encode_utf8, v_self)
else:
raise TyperError("encoding %s not implemented" % (encoding, ))
diff --git a/pypy/rpython/test/test_runicode.py b/pypy/rpython/test/test_runicode.py
--- a/pypy/rpython/test/test_runicode.py
+++ b/pypy/rpython/test/test_runicode.py
@@ -98,9 +98,11 @@
self.interpret_raises(UnicodeEncodeError, f, [1234])
def test_unicode_encode(self):
- def f(x):
- y = u'xxx'
- return (y + unichr(x)).encode('ascii') + y.encode('latin-1')
+ def f(n):
+ x = u'xxx' + unichr(n)
+ y = u'àèì' + unichr(n)
+ z = u'美' + unichr(n)
+ return x.encode('ascii') + y.encode('latin-1') + z.encode('utf-8')
assert self.ll_to_string(self.interpret(f, [38])) == f(38)
@@ -128,11 +130,14 @@
assert self.interpret(f, [300, False]) == f(300, False)
def test_unicode_decode(self):
- def f(x):
- y = 'xxx'
- return (y + chr(x)).decode('ascii') + chr(x).decode("latin-1")
+ strings = ['xxx', u'àèì'.encode('latin-1'), u'美'.encode('utf-8')]
+ def f(n):
+ x = strings[n]
+ y = strings[n+1]
+ z = strings[n+2]
+ return x.decode('ascii') + y.decode('latin-1') + z.decode('utf-8')
- assert self.ll_to_string(self.interpret(f, [38])) == f(38)
+ assert self.ll_to_string(self.interpret(f, [0])) == f(0)
def test_unicode_decode_error(self):
def f(x):
More information about the pypy-commit
mailing list