[Python-checkins] bpo-45061: Detect refcount bug on empty string singleton (GH-28504)

Tue Sep 21 17:43:17 EDT 2021

https://github.com/python/cpython/commit/86f28372b17c8c56539e9543bea9f125ab11b8aa
commit: 86f28372b17c8c56539e9543bea9f125ab11b8aa
branch: main
author: Victor Stinner <vstinner at python.org>
committer: vstinner <vstinner at python.org>
date: 2021-09-21T23:43:09+02:00
summary:

bpo-45061: Detect refcount bug on empty string singleton (GH-28504)

Detect refcount bugs in C extensions when the empty Unicode string
singleton is destroyed by mistake.

* Move forward declarations to the top of unicodeobject.c.
* Simplify unicode_is_singleton().

files:
M Misc/NEWS.d/next/Core and Builtins/2021-09-21-22-27-25.bpo-45061.5IOUf0.rst
M Objects/unicodeobject.c

diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-09-21-22-27-25.bpo-45061.5IOUf0.rst b/Misc/NEWS.d/next/Core and Builtins/2021-09-21-22-27-25.bpo-45061.5IOUf0.rst
index 08924531dc3f1..caeb36ba52646 100644
--- a/Misc/NEWS.d/next/Core and Builtins/2021-09-21-22-27-25.bpo-45061.5IOUf0.rst	
+++ b/Misc/NEWS.d/next/Core and Builtins/2021-09-21-22-27-25.bpo-45061.5IOUf0.rst	
@@ -1,4 +1,5 @@
 Add a deallocator to the bool type to detect refcount bugs in C extensions
 which call Py_DECREF(Py_True) or Py_DECREF(Py_False) by mistake. Detect also
-refcount bugs when the empty tuple singleton is destroyed by mistake. Patch
-by Victor Stinner.
+refcount bugs when the empty tuple singleton or the Unicode empty string
+singleton is destroyed by mistake.
+Patch by Victor Stinner.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 3e6b70bf4b6f5..9b0b8694bfdf6 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -48,6 +48,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 #include "pycore_interp.h"        // PyInterpreterState.fs_codec
 #include "pycore_object.h"        // _PyObject_GC_TRACK()
 #include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
+#include "pycore_pyerrors.h"      // _Py_FatalRefcountError()
 #include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
 #include "pycore_pystate.h"       // _PyInterpreterState_GET()
 #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
@@ -212,6 +213,24 @@ extern "C" {
 #endif
 
 
+/* Forward declaration */
+static inline int
+_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
+static inline void
+_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
+static PyObject *
+unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
+                    const char *errors);
+static PyObject *
+unicode_decode_utf8(const char *s, Py_ssize_t size,
+                    _Py_error_handler error_handler, const char *errors,
+                    Py_ssize_t *consumed);
+#ifdef Py_DEBUG
+static inline int unicode_is_finalizing(void);
+static int unicode_is_singleton(PyObject *unicode);
+#endif
+
+
 static struct _Py_unicode_state*
 get_unicode_state(void)
 {
@@ -279,19 +298,6 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
 }
 
 
-/* Forward declaration */
-static inline int
-_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
-static inline void
-_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
-static PyObject *
-unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
-                    const char *errors);
-static PyObject *
-unicode_decode_utf8(const char *s, Py_ssize_t size,
-                    _Py_error_handler error_handler, const char *errors,
-                    Py_ssize_t *consumed);
-
 /* Fast detection of the most frequent whitespace characters */
 const unsigned char _Py_ascii_whitespace[] = {
     0, 0, 0, 0, 0, 0, 0, 0,
@@ -1930,6 +1936,12 @@ _PyUnicode_Ready(PyObject *unicode)
 static void
 unicode_dealloc(PyObject *unicode)
 {
+#ifdef Py_DEBUG
+    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
+        _Py_FatalRefcountError("deallocating an Unicode singleton");
+    }
+#endif
+
     switch (PyUnicode_CHECK_INTERNED(unicode)) {
     case SSTATE_NOT_INTERNED:
         break;
@@ -1982,11 +1994,8 @@ unicode_is_singleton(PyObject *unicode)
     if (unicode == state->empty_string) {
         return 1;
     }
-    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
-    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
-    {
-        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
-        if (ch < 256 && state->latin1[ch] == unicode) {
+    for (Py_ssize_t i = 0; i < 256; i++) {
+        if (unicode == state->latin1[i]) {
             return 1;
         }
     }
@@ -15984,6 +15993,16 @@ _PyUnicode_EnableLegacyWindowsFSEncoding(void)
 #endif
 
 
+#ifdef Py_DEBUG
+static inline int
+unicode_is_finalizing(void)
+{
+    struct _Py_unicode_state *state = get_unicode_state();
+    return (state->interned == NULL);
+}
+#endif
+
+
 void
 _PyUnicode_Fini(PyInterpreterState *interp)
 {