[Python-checkins] gh-96735: Fix undefined behaviour in struct unpacking functions (#96739)

mdickinson webhook-mailer at python.org
Sun Sep 25 05:55:52 EDT 2022


https://github.com/python/cpython/commit/f5f047aa628caeca680745c55e24519f06aa6724
commit: f5f047aa628caeca680745c55e24519f06aa6724
branch: main
author: Mark Dickinson <dickinsm at gmail.com>
committer: mdickinson <dickinsm at gmail.com>
date: 2022-09-25T10:55:33+01:00
summary:

gh-96735: Fix undefined behaviour in struct unpacking functions (#96739)

This PR fixes undefined behaviour in the struct module unpacking support functions `bu_longlong`, `lu_longlong`, `bu_int` and `lu_int`; thanks to @kumaraditya303 for finding these.

The fix is to accumulate the bytes in an unsigned integer type instead of a signed integer type, then to convert to the appropriate signed type. In cases where the width matches, that conversion will typically be compiled away to a no-op.
(Evidence from Godbolt: https://godbolt.org/z/5zvxodj64 .)

To make the conversions efficient, I've specialised the relevant functions for their output size: for `bu_longlong` and `lu_longlong`, this only entails checking that the output size is indeed `8`. But `bu_int` and `lu_int` were used for format sizes `2` and `4` - I've split those into two separate functions each.

No tests, because all of the affected cases are already exercised by the test suite.

files:
A Misc/NEWS.d/next/Library/2022-09-10-16-46-16.gh-issue-96735.0YzJuG.rst
M Modules/_struct.c

diff --git a/Misc/NEWS.d/next/Library/2022-09-10-16-46-16.gh-issue-96735.0YzJuG.rst b/Misc/NEWS.d/next/Library/2022-09-10-16-46-16.gh-issue-96735.0YzJuG.rst
new file mode 100644
index 000000000000..a29ada98ac20
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-09-10-16-46-16.gh-issue-96735.0YzJuG.rst
@@ -0,0 +1 @@
+Fix undefined behaviour in :func:`struct.unpack`.
diff --git a/Modules/_struct.c b/Modules/_struct.c
index 9d66568a2826..09f52a981485 100644
--- a/Modules/_struct.c
+++ b/Modules/_struct.c
@@ -805,19 +805,38 @@ static const formatdef native_table[] = {
 
 /* Big-endian routines. *****************************************************/
 
+static PyObject *
+bu_short(_structmodulestate *state, const char *p, const formatdef *f)
+{
+    unsigned long x = 0;
+
+    /* This function is only ever used in the case f->size == 2. */
+    assert(f->size == 2);
+    Py_ssize_t i = 2;
+    const unsigned char *bytes = (const unsigned char *)p;
+    do {
+        x = (x<<8) | *bytes++;
+    } while (--i > 0);
+    /* Extend sign, avoiding implementation-defined or undefined behaviour. */
+    x = (x ^ 0x8000U) - 0x8000U;
+    return PyLong_FromLong(x & 0x8000U ? -1 - (long)(~x) : (long)x);
+}
+
 static PyObject *
 bu_int(_structmodulestate *state, const char *p, const formatdef *f)
 {
-    long x = 0;
-    Py_ssize_t i = f->size;
+    unsigned long x = 0;
+
+    /* This function is only ever used in the case f->size == 4. */
+    assert(f->size == 4);
+    Py_ssize_t i = 4;
     const unsigned char *bytes = (const unsigned char *)p;
     do {
         x = (x<<8) | *bytes++;
     } while (--i > 0);
-    /* Extend the sign bit. */
-    if (SIZEOF_LONG > f->size)
-        x |= -(x & (1L << ((8 * f->size) - 1)));
-    return PyLong_FromLong(x);
+    /* Extend sign, avoiding implementation-defined or undefined behaviour. */
+    x = (x ^ 0x80000000U) - 0x80000000U;
+    return PyLong_FromLong(x & 0x80000000U ? -1 - (long)(~x) : (long)x);
 }
 
 static PyObject *
@@ -835,16 +854,19 @@ bu_uint(_structmodulestate *state, const char *p, const formatdef *f)
 static PyObject *
 bu_longlong(_structmodulestate *state, const char *p, const formatdef *f)
 {
-    long long x = 0;
-    Py_ssize_t i = f->size;
+    unsigned long long x = 0;
+
+    /* This function is only ever used in the case f->size == 8. */
+    assert(f->size == 8);
+    Py_ssize_t i = 8;
     const unsigned char *bytes = (const unsigned char *)p;
     do {
         x = (x<<8) | *bytes++;
     } while (--i > 0);
-    /* Extend the sign bit. */
-    if (SIZEOF_LONG_LONG > f->size)
-        x |= -(x & ((long long)1 << ((8 * f->size) - 1)));
-    return PyLong_FromLongLong(x);
+    /* Extend sign, avoiding implementation-defined or undefined behaviour. */
+    x = (x ^ 0x8000000000000000U) - 0x8000000000000000U;
+    return PyLong_FromLongLong(
+        x & 0x8000000000000000U ? -1 - (long long)(~x) : (long long)x);
 }
 
 static PyObject *
@@ -1009,7 +1031,7 @@ static formatdef bigendian_table[] = {
     {'c',       1,              0,              nu_char,        np_char},
     {'s',       1,              0,              NULL},
     {'p',       1,              0,              NULL},
-    {'h',       2,              0,              bu_int,         bp_int},
+    {'h',       2,              0,              bu_short,       bp_int},
     {'H',       2,              0,              bu_uint,        bp_uint},
     {'i',       4,              0,              bu_int,         bp_int},
     {'I',       4,              0,              bu_uint,        bp_uint},
@@ -1026,19 +1048,38 @@ static formatdef bigendian_table[] = {
 
 /* Little-endian routines. *****************************************************/
 
+static PyObject *
+lu_short(_structmodulestate *state, const char *p, const formatdef *f)
+{
+    unsigned long x = 0;
+
+    /* This function is only ever used in the case f->size == 2. */
+    assert(f->size == 2);
+    Py_ssize_t i = 2;
+    const unsigned char *bytes = (const unsigned char *)p;
+    do {
+        x = (x<<8) | bytes[--i];
+    } while (i > 0);
+    /* Extend sign, avoiding implementation-defined or undefined behaviour. */
+    x = (x ^ 0x8000U) - 0x8000U;
+    return PyLong_FromLong(x & 0x8000U ? -1 - (long)(~x) : (long)x);
+}
+
 static PyObject *
 lu_int(_structmodulestate *state, const char *p, const formatdef *f)
 {
-    long x = 0;
-    Py_ssize_t i = f->size;
+    unsigned long x = 0;
+
+    /* This function is only ever used in the case f->size == 4. */
+    assert(f->size == 4);
+    Py_ssize_t i = 4;
     const unsigned char *bytes = (const unsigned char *)p;
     do {
         x = (x<<8) | bytes[--i];
     } while (i > 0);
-    /* Extend the sign bit. */
-    if (SIZEOF_LONG > f->size)
-        x |= -(x & (1L << ((8 * f->size) - 1)));
-    return PyLong_FromLong(x);
+    /* Extend sign, avoiding implementation-defined or undefined behaviour. */
+    x = (x ^ 0x80000000U) - 0x80000000U;
+    return PyLong_FromLong(x & 0x80000000U ? -1 - (long)(~x) : (long)x);
 }
 
 static PyObject *
@@ -1056,16 +1097,19 @@ lu_uint(_structmodulestate *state, const char *p, const formatdef *f)
 static PyObject *
 lu_longlong(_structmodulestate *state, const char *p, const formatdef *f)
 {
-    long long x = 0;
-    Py_ssize_t i = f->size;
+    unsigned long long x = 0;
+
+    /* This function is only ever used in the case f->size == 8. */
+    assert(f->size == 8);
+    Py_ssize_t i = 8;
     const unsigned char *bytes = (const unsigned char *)p;
     do {
         x = (x<<8) | bytes[--i];
     } while (i > 0);
-    /* Extend the sign bit. */
-    if (SIZEOF_LONG_LONG > f->size)
-        x |= -(x & ((long long)1 << ((8 * f->size) - 1)));
-    return PyLong_FromLongLong(x);
+    /* Extend sign, avoiding implementation-defined or undefined behaviour. */
+    x = (x ^ 0x8000000000000000U) - 0x8000000000000000U;
+    return PyLong_FromLongLong(
+        x & 0x8000000000000000U ? -1 - (long long)(~x) : (long long)x);
 }
 
 static PyObject *
@@ -1213,7 +1257,7 @@ static formatdef lilendian_table[] = {
     {'c',       1,              0,              nu_char,        np_char},
     {'s',       1,              0,              NULL},
     {'p',       1,              0,              NULL},
-    {'h',       2,              0,              lu_int,         lp_int},
+    {'h',       2,              0,              lu_short,       lp_int},
     {'H',       2,              0,              lu_uint,        lp_uint},
     {'i',       4,              0,              lu_int,         lp_int},
     {'I',       4,              0,              lu_uint,        lp_uint},



More information about the Python-checkins mailing list