[pypy-commit] pypy py3k: store both the unicode string and the utf8-encoded bytes for unicode objects, to avoid redoing the encoding every time identifier_w is called on them. In the future, we might optimize W_UnicodeObject by storing only the utf-8 encoded bytes, but for now it is easier to keep both. This change also fixes the methodcache tests, because they rely on the identity of the unwrapped string returned by str_w().

Mon Sep 3 12:27:01 CEST 2012

Author: Antonio Cuni <anto.cuni at gmail.com>
Branch: py3k
Changeset: r57094:991b4665f673
Date: 2012-09-03 12:26 +0200
http://bitbucket.org/pypy/pypy/changeset/991b4665f673/

Log:	store both the unicode string and the utf8-encoded bytes for unicode
	objects, to avoid redoing the encoding every time identifier_w is
	called on them. In the future, we might optimize W_UnicodeObject by
	storing *only* the utf-8 encoded bytes, but for now it is easier to
	keep both. This change also fixes the methodcache tests, because they
	rely on the identity of the unwrapped string returned by str_w().

diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -205,6 +205,10 @@
         raise OperationError(space.w_TypeError,
                              typed_unwrap_error_msg(space, "string", self))
 
+    def identifier_w(self, space):
+        raise OperationError(space.w_TypeError,
+                             typed_unwrap_error_msg(space, "string", self))
+
     def int_w(self, space):
         raise OperationError(space.w_TypeError,
                              typed_unwrap_error_msg(space, "integer", self))
@@ -1332,12 +1336,15 @@
         return self.str_w(w_obj)
 
     def str_w(self, w_obj):
+        """
+        if w_obj is unicode, call identifier_w() (i.e., return the UTF-8
+        encoded string). Else, call bytes_w().
+        
+        Maybe we should kill str_w completely and manually substitute it with
+        identifier_w/bytes_w at all call sites?
+        """
         if self.isinstance_w(w_obj, self.w_unicode):
-            try:
-                return self.unicode_w(w_obj).encode('ascii')
-            except UnicodeEncodeError:
-                w_bytes = self.call_method(w_obj, 'encode', self.wrap('utf-8'))
-                return self.bytes_w(w_bytes)
+            return w_obj.identifier_w(self)
         else:
             return w_obj.bytes_w(self)
 
@@ -1404,7 +1411,7 @@
         variables, methdods, functions, classes etc.). In py3k, identifiers
         are unicode strings and are unwrapped as UTF-8 encoded byte strings.
         """
-        return self.unicode_w(w_obj).encode('utf-8')
+        return w_obj.identifier_w(self)
 
     def bool_w(self, w_obj):
         # Unwraps a bool, also accepting an int for compatibility.
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -40,11 +40,12 @@
 
 class W_UnicodeObject(W_AbstractUnicodeObject):
     from pypy.objspace.std.unicodetype import unicode_typedef as typedef
-    _immutable_fields_ = ['_value']
+    _immutable_fields_ = ['_value', '_utf8']
 
     def __init__(w_self, unistr):
         assert isinstance(unistr, unicode)
         w_self._value = unistr
+        w_self._utf8 = unistr.encode('utf-8')
 
     def __repr__(w_self):
         """ representation for debugging purposes """
@@ -62,6 +63,9 @@
     def unicode_w(self, space):
         return self._value
 
+    def identifier_w(self, space):
+        return self._utf8
+
 W_UnicodeObject.EMPTY = W_UnicodeObject(u'')
 
 registerimplementation(W_UnicodeObject)