pythongh-107137: Add _PyTuple_NewNoTrack() internal C API

Fix a crash in PySequence_Tuple() when the tuple which is being initialized was accessed via GC introspection, like the gc.get_referrers() function. Now the function only tracks the tuple by the GC once it's fully initialized. Add _PyTuple_NewNoTrack() and _PyTuple_ResizeNoTrack() functions to the internal C API: similar to PyTuple_New() and _PyTuple_Resize(), but don't track the tuple by the GC. Modify PySequence_Tuple() to use these functions.
vstinner · Jul 24, 2023 · 9c9f4a7 · 9c9f4a7
1 parent 0a9b339
commit 9c9f4a7
Show file tree

Hide file tree

Showing 5 changed files with 103 additions and 29 deletions.
diff --git a/Include/internal/pycore_tuple.h b/Include/internal/pycore_tuple.h
@@ -11,6 +11,11 @@ extern "C" {
 extern void _PyTuple_MaybeUntrack(PyObject *);
 extern void _PyTuple_DebugMallocStats(FILE *out);
 
+// Similar to PyTuple_New() and _PyTuple_Resize(), but don't track the tuple
+// by the GC.
+extern PyObject* _PyTuple_NewNoTrack(Py_ssize_t size);
+extern int _PyTuple_ResizeNoTrack(PyObject **pv, Py_ssize_t newsize);
+
 /* runtime lifecycle */
 
 extern PyStatus _PyTuple_InitGlobalObjects(PyInterpreterState *);

diff --git a/Lib/test/test_tuple.py b/Lib/test/test_tuple.py
@@ -416,6 +416,29 @@ def test_lexicographic_ordering(self):
         self.assertLess(a, b)
         self.assertLess(b, c)
 
+    def test_sequence_tuple(self):
+        # gh-39117, gh-59313, gh-107137: PySequence_Tuple() must not track
+        # the tuple by the GC before it's fully initialized.
+
+        # TAG object used to find the tuple in objects tracked by the GC
+        TAG = object()
+
+        def my_iter():
+            yield TAG
+
+            for ref in gc.get_referrers(TAG):
+                if isinstance(ref, tuple):
+                    raise Exception("PySequence_Tuple() leaks the tuple")
+
+            # Add 100 items to trigger _PyTuple_Resize() calls.
+            # PySequence_Tuple() started by allocating a tuple of 10 items.
+            for num in range(100):
+                yield num
+
+        seq = tuple(my_iter())
+        self.assertEqual(seq, (TAG, *range(100)))
+
+
 # Notes on testing hash codes.  The primary thing is that Python doesn't
 # care about "random" hash codes.  To the contrary, we like them to be
 # very regular when possible, so that the low-order bits are as evenly

diff --git a/Misc/NEWS.d/next/C API/2023-07-24-18-51-27.gh-issue-107137.zk8it0.rst b/Misc/NEWS.d/next/C API/2023-07-24-18-51-27.gh-issue-107137.zk8it0.rst
@@ -0,0 +1,4 @@
+Fix a crash in :c:func:`PySequence_Tuple` when the tuple which is being
+initialized was accessed via GC introspection, like the
+:func:`gc.get_referrers` function. Now the function only tracks the tuple by
+the GC once it's fully initialized. Patch by Victor Stinner.
diff --git a/Objects/abstract.c b/Objects/abstract.c
@@ -8,6 +8,7 @@
 #include "pycore_long.h"          // _Py_IsNegative
 #include "pycore_pyerrors.h"      // _PyErr_Occurred()
 #include "pycore_pystate.h"       // _PyThreadState_GET()
+#include "pycore_tuple.h"         // _PyTuple_NewNoTrack()
 #include "pycore_unionobject.h"   // _PyUnion_Check()
 #include <ctype.h>
 #include <stddef.h>               // offsetof()
@@ -2074,11 +2075,6 @@ PySequence_DelSlice(PyObject *s, Py_ssize_t i1, Py_ssize_t i2)
 PyObject *
 PySequence_Tuple(PyObject *v)
 {
-    PyObject *it;  /* iter(v) */
-    Py_ssize_t n;             /* guess for result tuple size */
-    PyObject *result = NULL;
-    Py_ssize_t j;
-
     if (v == NULL) {
         return null_error();
     }
@@ -2091,28 +2087,36 @@ PySequence_Tuple(PyObject *v)
            a copy, so there's no need for exactness below. */
         return Py_NewRef(v);
     }
-    if (PyList_CheckExact(v))
+    if (PyList_CheckExact(v)) {
         return PyList_AsTuple(v);
+    }
 
     /* Get iterator. */
-    it = PyObject_GetIter(v);
-    if (it == NULL)
+    PyObject *it = PyObject_GetIter(v);  // iter(v)
+    if (it == NULL) {
         return NULL;
+    }
 
     /* Guess result size and allocate space. */
-    n = PyObject_LengthHint(v, 10);
-    if (n == -1)
-        goto Fail;
-    result = PyTuple_New(n);
-    if (result == NULL)
+    Py_ssize_t n = PyObject_LengthHint(v, 10);  // guess for result tuple size
+    if (n == -1) {
+        Py_DECREF(it);
+        return NULL;
+    }
+
+    PyObject *result = _PyTuple_NewNoTrack(n);
+    if (result == NULL) {
         goto Fail;
+    }
 
     /* Fill the tuple. */
-    for (j = 0; ; ++j) {
+    Py_ssize_t j = 0;
+    for (; ; ++j) {
         PyObject *item = PyIter_Next(it);
         if (item == NULL) {
-            if (PyErr_Occurred())
+            if (PyErr_Occurred()) {
                 goto Fail;
+            }
             break;
         }
         if (j >= n) {
@@ -2132,7 +2136,7 @@ PySequence_Tuple(PyObject *v)
                 goto Fail;
             }
             n = (Py_ssize_t)newn;
-            if (_PyTuple_Resize(&result, n) != 0) {
+            if (_PyTuple_ResizeNoTrack(&result, n) != 0) {
                 Py_DECREF(item);
                 goto Fail;
             }
@@ -2141,11 +2145,15 @@ PySequence_Tuple(PyObject *v)
     }
 
     /* Cut tuple back if guess was too large. */
-    if (j < n &&
-        _PyTuple_Resize(&result, j) != 0)
+    if (j < n && _PyTuple_ResizeNoTrack(&result, j) != 0) {
         goto Fail;
-
+    }
     Py_DECREF(it);
+
+    // No need to track the empty tuple singleton
+    if (j != 0) {
+        PyObject_GC_Track(result);
+    }
     return result;
 
 Fail:

diff --git a/Objects/tupleobject.c b/Objects/tupleobject.c
@@ -65,24 +65,38 @@ tuple_get_empty(void)
     return Py_NewRef(&_Py_SINGLETON(tuple_empty));
 }
 
-PyObject *
-PyTuple_New(Py_ssize_t size)
+
+static inline PyObject *
+tuple_new_capi(Py_ssize_t size, int track)
 {
-    PyTupleObject *op;
     if (size == 0) {
         return tuple_get_empty();
     }
-    op = tuple_alloc(size);
+    PyTupleObject *op = tuple_alloc(size);
     if (op == NULL) {
         return NULL;
     }
     for (Py_ssize_t i = 0; i < size; i++) {
         op->ob_item[i] = NULL;
     }
-    _PyObject_GC_TRACK(op);
+    if (track) {
+        _PyObject_GC_TRACK(op);
+    }
     return (PyObject *) op;
 }
 
+PyObject *
+_PyTuple_NewNoTrack(Py_ssize_t size)
+{
+    return tuple_new_capi(size, 0);
+}
+
+PyObject *
+PyTuple_New(Py_ssize_t size)
+{
+    return tuple_new_capi(size, 1);
+}
+
 Py_ssize_t
 PyTuple_Size(PyObject *op)
 {
@@ -894,8 +908,8 @@ PyTypeObject PyTuple_Type = {
    efficiently.  In any case, don't use this if the tuple may already be
    known to some other part of the code. */
 
-int
-_PyTuple_Resize(PyObject **pv, Py_ssize_t newsize)
+static int
+tuple_resize(PyObject **pv, Py_ssize_t newsize, int track)
 {
     PyTupleObject *v;
     PyTupleObject *sv;
@@ -927,7 +941,12 @@ _PyTuple_Resize(PyObject **pv, Py_ssize_t newsize)
         /* The empty tuple is statically allocated so we never
            resize it in-place. */
         Py_DECREF(v);
-        *pv = PyTuple_New(newsize);
+        if (track) {
+            *pv = PyTuple_New(newsize);
+        }
+        else {
+            *pv = _PyTuple_NewNoTrack(newsize);
+        }
         return *pv == NULL ? -1 : 0;
     }
 
@@ -952,14 +971,29 @@ _PyTuple_Resize(PyObject **pv, Py_ssize_t newsize)
     }
     _Py_NewReferenceNoTotal((PyObject *) sv);
     /* Zero out items added by growing */
-    if (newsize > oldsize)
+    if (newsize > oldsize) {
         memset(&sv->ob_item[oldsize], 0,
                sizeof(*sv->ob_item) * (newsize - oldsize));
+    }
     *pv = (PyObject *) sv;
-    _PyObject_GC_TRACK(sv);
+    if (track) {
+        _PyObject_GC_TRACK(*pv);
+    }
     return 0;
 }
 
+int
+_PyTuple_ResizeNoTrack(PyObject **pv, Py_ssize_t newsize)
+{
+    return tuple_resize(pv, newsize, 0);
+}
+
+int
+_PyTuple_Resize(PyObject **pv, Py_ssize_t newsize)
+{
+    return tuple_resize(pv, newsize, 1);
+}
+
 
 static void maybe_freelist_clear(PyInterpreterState *, int);