[pypy-commit] pypy py3k: store unicode for intern'd strings, fixes a long standing bug where interning a string w/ surrogates would fail

pjenvey noreply at buildbot.pypy.org
Mon Dec 8 05:34:25 CET 2014


Author: Philip Jenvey <pjenvey at underboss.org>
Branch: py3k
Changeset: r74857:001db61e74f0
Date: 2014-12-07 20:33 -0800
http://bitbucket.org/pypy/pypy/changeset/001db61e74f0/

Log:	store unicode for intern'd strings, fixes a long standing bug where
	interning a string w/ surrogates would fail

diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -362,7 +362,7 @@
         self.builtin_modules = {}
         self.reloading_modules = {}
 
-        self.interned_strings = make_weak_value_dictionary(self, str, W_Root)
+        self.interned_strings = make_weak_value_dictionary(self, unicode, W_Root)
         self.actionflag = ActionFlag()    # changed by the signal module
         self.check_signal_action = None   # changed by the signal module
         self.user_del_action = UserDelAction(self)
@@ -765,31 +765,35 @@
         else:
             return self.w_False
 
-    def new_interned_w_str(self, w_s):
-        assert isinstance(w_s, W_Root)   # and is not None
-        s = self.str_w(w_s)
+    def new_interned_w_str(self, w_u):
+        assert isinstance(w_u, W_Root)   # and is not None
+        u = self.unicode_w(w_u)
+        if not we_are_translated():
+            assert type(u) is unicode
+        w_u1 = self.interned_strings.get(u)
+        if w_u1 is None:
+            w_u1 = w_u
+            self.interned_strings.set(u, w_u1)
+        return w_u1
+
+    def new_interned_str(self, s):
+        """Assumes an identifier (utf-8 encoded str)"""
         if not we_are_translated():
             assert type(s) is str
-        w_s1 = self.interned_strings.get(s)
+        u = s.decode('utf-8')
+        w_s1 = self.interned_strings.get(u)
         if w_s1 is None:
-            w_s1 = w_s
-            self.interned_strings.set(s, w_s1)
-        return w_s1
-
-    def new_interned_str(self, s):
-        if not we_are_translated():
-            assert type(s) is str
-        w_s1 = self.interned_strings.get(s)
-        if w_s1 is None:
-            w_s1 = self.wrap(s)
-            self.interned_strings.set(s, w_s1)
+            w_s1 = self.wrap(u)
+            self.interned_strings.set(u, w_s1)
         return w_s1
 
     def is_interned_str(self, s):
+        """Assumes an identifier (utf-8 encoded str)"""
         # interface for marshal_impl
         if not we_are_translated():
             assert type(s) is str
-        return self.interned_strings.get(s) is not None
+        u = s.decode('utf-8')
+        return self.interned_strings.get(u) is not None
 
     def descr_self_interp_w(self, RequiredClass, w_obj):
         if not isinstance(w_obj, RequiredClass):
diff --git a/pypy/module/sys/test/test_sysmodule.py b/pypy/module/sys/test/test_sysmodule.py
--- a/pypy/module/sys/test/test_sysmodule.py
+++ b/pypy/module/sys/test/test_sysmodule.py
@@ -648,6 +648,9 @@
         assert s3 != s2
         s4 = s3.swapcase()
         assert intern(s4) is s2
+        s5 = "\ud800"
+        # previously failed
+        assert intern(s5) == s5
 
 
 class AppTestSysExcInfoDirect:


More information about the pypy-commit mailing list