[Python-checkins] [3.12] gh-105235: Prevent reading outside buffer during mmap.find() (GH-105252) (#106708)

sweeneyde webhook-mailer at python.org
Fri Jul 14 22:15:17 EDT 2023


https://github.com/python/cpython/commit/4f3edd6b535b6a0b7352df134c0f445ab279bfc0
commit: 4f3edd6b535b6a0b7352df134c0f445ab279bfc0
branch: 3.12
author: Miss Islington (bot) <31488909+miss-islington at users.noreply.github.com>
committer: sweeneyde <36520290+sweeneyde at users.noreply.github.com>
date: 2023-07-14T22:15:14-04:00
summary:

[3.12] gh-105235: Prevent reading outside buffer during mmap.find() (GH-105252) (#106708)

gh-105235: Prevent reading outside buffer during mmap.find() (GH-105252)

* Add a special case for s[-m:] == p in _PyBytes_Find

* Add tests for _PyBytes_Find

* Make sure that start <= end in mmap.find
(cherry picked from commit ab86426a3472ab68747815299d390b213793c3d1)

Co-authored-by: Dennis Sweeney <36520290+sweeneyde at users.noreply.github.com>

files:
A Misc/NEWS.d/next/Core and Builtins/2023-06-02-19-37-29.gh-issue-105235.fgFGTi.rst
M Lib/test/test_mmap.py
M Modules/_testinternalcapi.c
M Modules/mmapmodule.c
M Objects/bytesobject.c

diff --git a/Lib/test/test_mmap.py b/Lib/test/test_mmap.py
index 517cbe0cb115a..bab868600895c 100644
--- a/Lib/test/test_mmap.py
+++ b/Lib/test/test_mmap.py
@@ -299,6 +299,27 @@ def test_find_end(self):
         self.assertEqual(m.find(b'one', 1, -2), -1)
         self.assertEqual(m.find(bytearray(b'one')), 0)
 
+        for i in range(-n-1, n+1):
+            for j in range(-n-1, n+1):
+                for p in [b"o", b"on", b"two", b"ones", b"s"]:
+                    expected = data.find(p, i, j)
+                    self.assertEqual(m.find(p, i, j), expected, (p, i, j))
+
+    def test_find_does_not_access_beyond_buffer(self):
+        try:
+            flags = mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS
+            PAGESIZE = mmap.PAGESIZE
+            PROT_NONE = 0
+            PROT_READ = mmap.PROT_READ
+        except AttributeError as e:
+            raise unittest.SkipTest("mmap flags unavailable") from e
+        for i in range(0, 2049):
+            with mmap.mmap(-1, PAGESIZE * (i + 1),
+                           flags=flags, prot=PROT_NONE) as guard:
+                with mmap.mmap(-1, PAGESIZE * (i + 2048),
+                               flags=flags, prot=PROT_READ) as fm:
+                    fm.find(b"fo", -2)
+
 
     def test_rfind(self):
         # test the new 'end' parameter works as expected
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-06-02-19-37-29.gh-issue-105235.fgFGTi.rst b/Misc/NEWS.d/next/Core and Builtins/2023-06-02-19-37-29.gh-issue-105235.fgFGTi.rst
new file mode 100644
index 0000000000000..c28d0101cd4ba
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-06-02-19-37-29.gh-issue-105235.fgFGTi.rst	
@@ -0,0 +1 @@
+Prevent out-of-bounds memory access during ``mmap.find()`` calls.
diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c
index 7e2def7c20b1b..98f99e4d66cf1 100644
--- a/Modules/_testinternalcapi.c
+++ b/Modules/_testinternalcapi.c
@@ -15,6 +15,7 @@
 #include "frameobject.h"
 #include "pycore_atomic_funcs.h" // _Py_atomic_int_get()
 #include "pycore_bitutils.h"     // _Py_bswap32()
+#include "pycore_bytesobject.h"  // _PyBytes_Find()
 #include "pycore_compile.h"      // _PyCompile_CodeGen, _PyCompile_OptimizeCfg, _PyCompile_Assemble
 #include "pycore_ceval.h"        // _PyEval_AddPendingCall
 #include "pycore_fileutils.h"    // _Py_normpath
@@ -441,6 +442,118 @@ test_edit_cost(PyObject *self, PyObject *Py_UNUSED(args))
 }
 
 
+static int
+check_bytes_find(const char *haystack0, const char *needle0,
+                 int offset, Py_ssize_t expected)
+{
+    Py_ssize_t len_haystack = strlen(haystack0);
+    Py_ssize_t len_needle = strlen(needle0);
+    Py_ssize_t result_1 = _PyBytes_Find(haystack0, len_haystack,
+                                        needle0, len_needle, offset);
+    if (result_1 != expected) {
+        PyErr_Format(PyExc_AssertionError,
+                    "Incorrect result_1: '%s' in '%s' (offset=%zd)",
+                    needle0, haystack0, offset);
+        return -1;
+    }
+    // Allocate new buffer with no NULL terminator.
+    char *haystack = PyMem_Malloc(len_haystack);
+    if (haystack == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    char *needle = PyMem_Malloc(len_needle);
+    if (needle == NULL) {
+        PyMem_Free(haystack);
+        PyErr_NoMemory();
+        return -1;
+    }
+    memcpy(haystack, haystack0, len_haystack);
+    memcpy(needle, needle0, len_needle);
+    Py_ssize_t result_2 = _PyBytes_Find(haystack, len_haystack,
+                                        needle, len_needle, offset);
+    PyMem_Free(haystack);
+    PyMem_Free(needle);
+    if (result_2 != expected) {
+        PyErr_Format(PyExc_AssertionError,
+                    "Incorrect result_2: '%s' in '%s' (offset=%zd)",
+                    needle0, haystack0, offset);
+        return -1;
+    }
+    return 0;
+}
+
+static int
+check_bytes_find_large(Py_ssize_t len_haystack, Py_ssize_t len_needle,
+                       const char *needle)
+{
+    char *zeros = PyMem_RawCalloc(len_haystack, 1);
+    if (zeros == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    Py_ssize_t res = _PyBytes_Find(zeros, len_haystack, needle, len_needle, 0);
+    PyMem_RawFree(zeros);
+    if (res != -1) {
+        PyErr_Format(PyExc_AssertionError,
+                    "check_bytes_find_large(%zd, %zd) found %zd",
+                    len_haystack, len_needle, res);
+        return -1;
+    }
+    return 0;
+}
+
+static PyObject *
+test_bytes_find(PyObject *self, PyObject *Py_UNUSED(args))
+{
+    #define CHECK(H, N, O, E) do {               \
+        if (check_bytes_find(H, N, O, E) < 0) {  \
+            return NULL;                         \
+        }                                        \
+    } while (0)
+
+    CHECK("", "", 0, 0);
+    CHECK("Python", "", 0, 0);
+    CHECK("Python", "", 3, 3);
+    CHECK("Python", "", 6, 6);
+    CHECK("Python", "yth", 0, 1);
+    CHECK("ython", "yth", 1, 1);
+    CHECK("thon", "yth", 2, -1);
+    CHECK("Python", "thon", 0, 2);
+    CHECK("ython", "thon", 1, 2);
+    CHECK("thon", "thon", 2, 2);
+    CHECK("hon", "thon", 3, -1);
+    CHECK("Pytho", "zz", 0, -1);
+    CHECK("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "ab", 0, -1);
+    CHECK("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "ba", 0, -1);
+    CHECK("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "bb", 0, -1);
+    CHECK("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab", "ab", 0, 30);
+    CHECK("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaba", "ba", 0, 30);
+    CHECK("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaabb", "bb", 0, 30);
+    #undef CHECK
+
+    // Hunt for segfaults
+    // n, m chosen here so that (n - m) % (m + 1) == 0
+    // This would make default_find in fastsearch.h access haystack[n].
+    if (check_bytes_find_large(2048, 2, "ab") < 0) {
+        return NULL;
+    }
+    if (check_bytes_find_large(4096, 16, "0123456789abcdef") < 0) {
+        return NULL;
+    }
+    if (check_bytes_find_large(8192, 2, "ab") < 0) {
+        return NULL;
+    }
+    if (check_bytes_find_large(16384, 4, "abcd") < 0) {
+        return NULL;
+    }
+    if (check_bytes_find_large(32768, 2, "ab") < 0) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+
 static PyObject *
 normalize_path(PyObject *self, PyObject *filename)
 {
@@ -950,6 +1063,7 @@ static PyMethodDef module_functions[] = {
     {"reset_path_config", test_reset_path_config, METH_NOARGS},
     {"test_atomic_funcs", test_atomic_funcs, METH_NOARGS},
     {"test_edit_cost", test_edit_cost, METH_NOARGS},
+    {"test_bytes_find", test_bytes_find, METH_NOARGS},
     {"normalize_path", normalize_path, METH_O, NULL},
     {"get_getpath_codeobject", get_getpath_codeobject, METH_NOARGS, NULL},
     {"EncodeLocaleEx", encode_locale_ex, METH_VARARGS},
diff --git a/Modules/mmapmodule.c b/Modules/mmapmodule.c
index 6bde9939eaa2c..1ffadccb91625 100644
--- a/Modules/mmapmodule.c
+++ b/Modules/mmapmodule.c
@@ -343,12 +343,17 @@ mmap_gfind(mmap_object *self,
 
         Py_ssize_t res;
         CHECK_VALID_OR_RELEASE(NULL, view);
-        if (reverse) {
+        if (end < start) {
+            res = -1;
+        }
+        else if (reverse) {
+            assert(0 <= start && start <= end && end <= self->size);
             res = _PyBytes_ReverseFind(
                 self->data + start, end - start,
                 view.buf, view.len, start);
         }
         else {
+            assert(0 <= start && start <= end && end <= self->size);
             res = _PyBytes_Find(
                 self->data + start, end - start,
                 view.buf, view.len, start);
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
index abbf3eeb16c35..f3a978c86c360 100644
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@@ -1274,8 +1274,25 @@ _PyBytes_Find(const char *haystack, Py_ssize_t len_haystack,
               const char *needle, Py_ssize_t len_needle,
               Py_ssize_t offset)
 {
-    return stringlib_find(haystack, len_haystack,
-                          needle, len_needle, offset);
+    assert(len_haystack >= 0);
+    assert(len_needle >= 0);
+    // Extra checks because stringlib_find accesses haystack[len_haystack].
+    if (len_needle == 0) {
+        return offset;
+    }
+    if (len_needle > len_haystack) {
+        return -1;
+    }
+    assert(len_haystack >= 1);
+    Py_ssize_t res = stringlib_find(haystack, len_haystack - 1,
+                                    needle, len_needle, offset);
+    if (res == -1) {
+        Py_ssize_t last_align = len_haystack - len_needle;
+        if (memcmp(haystack + last_align, needle, len_needle) == 0) {
+            return offset + last_align;
+        }
+    }
+    return res;
 }
 
 Py_ssize_t



More information about the Python-checkins mailing list