[Python-3000-checkins] r55583 - in python/branches/py3k-struni: Include/stringobject.h Include/unicodeobject.h Modules/main.c Objects/boolobject.c Objects/unicodeobject.c

walter.doerwald python-3000-checkins at python.org
Fri May 25 15:52:16 CEST 2007


Author: walter.doerwald
Date: Fri May 25 15:52:07 2007
New Revision: 55583

Modified:
   python/branches/py3k-struni/Include/stringobject.h
   python/branches/py3k-struni/Include/unicodeobject.h
   python/branches/py3k-struni/Modules/main.c
   python/branches/py3k-struni/Objects/boolobject.c
   python/branches/py3k-struni/Objects/unicodeobject.c
Log:
Add interning of unicode strings by copying the functionality from
stringobject.c.

Intern "True" and "False" in bool_repr() again as it was in the
8bit string era.


Modified: python/branches/py3k-struni/Include/stringobject.h
==============================================================================
--- python/branches/py3k-struni/Include/stringobject.h	(original)
+++ python/branches/py3k-struni/Include/stringobject.h	Fri May 25 15:52:07 2007
@@ -48,10 +48,6 @@
      */
 } PyStringObject;
 
-#define SSTATE_NOT_INTERNED 0
-#define SSTATE_INTERNED_MORTAL 1
-#define SSTATE_INTERNED_IMMORTAL 2
-
 PyAPI_DATA(PyTypeObject) PyBaseString_Type;
 PyAPI_DATA(PyTypeObject) PyString_Type;
 

Modified: python/branches/py3k-struni/Include/unicodeobject.h
==============================================================================
--- python/branches/py3k-struni/Include/unicodeobject.h	(original)
+++ python/branches/py3k-struni/Include/unicodeobject.h	Fri May 25 15:52:07 2007
@@ -390,6 +390,9 @@
     Py_ssize_t length;		/* Length of raw Unicode data in buffer */
     Py_UNICODE *str;		/* Raw Unicode buffer */
     long hash;			/* Hash value; -1 if not set */
+    int state;			/* != 0 if interned. In this case the two
+    				 * references from the dictionary to this object
+    				 * are *not* counted in ob_refcnt. */
     PyObject *defenc;		/* (Default) Encoded version as Python
 				   string, or NULL; this is used for
 				   implementing the buffer protocol */
@@ -397,6 +400,10 @@
 
 PyAPI_DATA(PyTypeObject) PyUnicode_Type;
 
+#define SSTATE_NOT_INTERNED 0
+#define SSTATE_INTERNED_MORTAL 1
+#define SSTATE_INTERNED_IMMORTAL 2
+
 #define PyUnicode_Check(op) \
                  PyType_FastSubclass((op)->ob_type, Py_TPFLAGS_UNICODE_SUBCLASS)
 #define PyUnicode_CheckExact(op) ((op)->ob_type == &PyUnicode_Type)
@@ -529,6 +536,14 @@
 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list);
 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...);
 
+PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
+PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
+PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *);
+PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
+
+/* Use only if you know it's a unicode string */
+#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state)
+
 /* --- wchar_t support for platforms which support it --------------------- */
 
 #ifdef HAVE_WCHAR_H

Modified: python/branches/py3k-struni/Modules/main.c
==============================================================================
--- python/branches/py3k-struni/Modules/main.c	(original)
+++ python/branches/py3k-struni/Modules/main.c	Fri May 25 15:52:07 2007
@@ -521,7 +521,7 @@
 #ifdef __INSURE__
 	/* Insure++ is a memory analysis tool that aids in discovering
 	 * memory leaks and other memory problems.  On Python exit, the
-	 * interned string dictionary is flagged as being in use at exit
+	 * interned string dictionaries are flagged as being in use at exit
 	 * (which it is).  Under normal circumstances, this is fine because
 	 * the memory will be automatically reclaimed by the system.  Under
 	 * memory debugging, it's a huge source of useless noise, so we
@@ -529,6 +529,7 @@
 	 * reports.  -baw
 	 */
 	_Py_ReleaseInternedStrings();
+	_Py_ReleaseInternedUnicodeStrings();
 #endif /* __INSURE__ */
 
 	return sts;

Modified: python/branches/py3k-struni/Objects/boolobject.c
==============================================================================
--- python/branches/py3k-struni/Objects/boolobject.c	(original)
+++ python/branches/py3k-struni/Objects/boolobject.c	Fri May 25 15:52:07 2007
@@ -24,10 +24,10 @@
 
 	if (self == Py_True)
 		s = true_str ? true_str :
-			(true_str = PyUnicode_FromString("True"));
+			(true_str = PyUnicode_InternFromString("True"));
 	else
 		s = false_str ? false_str :
-			(false_str = PyUnicode_FromString("False"));
+			(false_str = PyUnicode_InternFromString("False"));
 	Py_XINCREF(s);
 	return s;
 }

Modified: python/branches/py3k-struni/Objects/unicodeobject.c
==============================================================================
--- python/branches/py3k-struni/Objects/unicodeobject.c	(original)
+++ python/branches/py3k-struni/Objects/unicodeobject.c	Fri May 25 15:52:07 2007
@@ -92,6 +92,16 @@
 extern "C" {
 #endif
 
+/* This dictionary holds all interned unicode strings.  Note that references
+   to strings in this dictionary are *not* counted in the string's ob_refcnt.
+   When the interned string reaches a refcnt of 0 the string deallocation
+   function will delete the reference from this dictionary.
+
+   Another way to look at this is to say that the actual reference
+   count of a string is:  s->ob_refcnt + (s->state?2:0)
+*/
+static PyObject *interned;
+
 /* Free list for Unicode objects */
 static PyUnicodeObject *unicode_freelist;
 static int unicode_freelist_size;
@@ -276,6 +286,7 @@
     unicode->str[length] = 0;
     unicode->length = length;
     unicode->hash = -1;
+    unicode->state = 0;
     unicode->defenc = NULL;
     return unicode;
 
@@ -288,6 +299,25 @@
 static
 void unicode_dealloc(register PyUnicodeObject *unicode)
 {
+    switch (PyUnicode_CHECK_INTERNED(unicode)) {
+        case SSTATE_NOT_INTERNED:
+            break;
+
+        case SSTATE_INTERNED_MORTAL:
+            /* revive dead object temporarily for DelItem */
+            unicode->ob_refcnt = 3;
+            if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
+                Py_FatalError(
+                    "deletion of interned unicode string failed");
+            break;
+
+        case SSTATE_INTERNED_IMMORTAL:
+            Py_FatalError("Immortal interned unicode string died.");
+
+        default:
+            Py_FatalError("Inconsistent interned unicode string state.");
+    }
+
     if (PyUnicode_CheckExact(unicode) &&
 	unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
         /* Keep-Alive optimization */
@@ -8564,6 +8594,115 @@
     unicode_freelist_size = 0;
 }
 
+void
+PyUnicode_InternInPlace(PyObject **p)
+{
+	register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
+	PyObject *t;
+	if (s == NULL || !PyUnicode_Check(s))
+		Py_FatalError(
+		    "PyUnicode_InternInPlace: unicode strings only please!");
+	/* If it's a subclass, we don't really know what putting
+	   it in the interned dict might do. */
+	if (!PyUnicode_CheckExact(s))
+		return;
+	if (PyUnicode_CHECK_INTERNED(s))
+		return;
+	if (interned == NULL) {
+		interned = PyDict_New();
+		if (interned == NULL) {
+			PyErr_Clear(); /* Don't leave an exception */
+			return;
+		}
+	}
+	t = PyDict_GetItem(interned, (PyObject *)s);
+	if (t) {
+		Py_INCREF(t);
+		Py_DECREF(*p);
+		*p = t;
+		return;
+	}
+
+	if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
+		PyErr_Clear();
+		return;
+	}
+	/* The two references in interned are not counted by refcnt.
+	   The deallocator will take care of this */
+	s->ob_refcnt -= 2;
+	PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
+}
+
+void
+PyUnicode_InternImmortal(PyObject **p)
+{
+	PyUnicode_InternInPlace(p);
+	if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
+		PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
+		Py_INCREF(*p);
+	}
+}
+
+PyObject *
+PyUnicode_InternFromString(const char *cp)
+{
+	PyObject *s = PyUnicode_FromString(cp);
+	if (s == NULL)
+		return NULL;
+	PyUnicode_InternInPlace(&s);
+	return s;
+}
+
+void _Py_ReleaseInternedUnicodeStrings(void)
+{
+	PyObject *keys;
+	PyUnicodeObject *s;
+	Py_ssize_t i, n;
+	Py_ssize_t immortal_size = 0, mortal_size = 0;
+
+	if (interned == NULL || !PyDict_Check(interned))
+		return;
+	keys = PyDict_Keys(interned);
+	if (keys == NULL || !PyList_Check(keys)) {
+		PyErr_Clear();
+		return;
+	}
+
+	/* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
+	   detector, interned unicode strings are not forcibly deallocated;
+	   rather, we give them their stolen references back, and then clear
+	   and DECREF the interned dict. */
+
+	n = PyList_GET_SIZE(keys);
+	fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
+		n);
+	for (i = 0; i < n; i++) {
+		s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
+		switch (s->state) {
+		case SSTATE_NOT_INTERNED:
+			/* XXX Shouldn't happen */
+			break;
+		case SSTATE_INTERNED_IMMORTAL:
+			s->ob_refcnt += 1;
+			immortal_size += s->length;
+			break;
+		case SSTATE_INTERNED_MORTAL:
+			s->ob_refcnt += 2;
+			mortal_size += s->length;
+			break;
+		default:
+			Py_FatalError("Inconsistent interned string state.");
+		}
+		s->state = SSTATE_NOT_INTERNED;
+	}
+	fprintf(stderr, "total size of all interned strings: "
+			"%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
+			"mortal/immortal\n", mortal_size, immortal_size);
+	Py_DECREF(keys);
+	PyDict_Clear(interned);
+	Py_DECREF(interned);
+	interned = NULL;
+}
 
 
 /********************* Unicode Iterator **************************/


More information about the Python-3000-checkins mailing list