[Python-checkins] bpo-47256: Increasing the depth of backtracking in RE (GH-32411)

serhiy-storchaka webhook-mailer at python.org
Mon Apr 18 09:50:47 EDT 2022


https://github.com/python/cpython/commit/a29f858124bc698f6604716b73306c65b63b5054
commit: a29f858124bc698f6604716b73306c65b63b5054
branch: main
author: Ma Lin <animalize at users.noreply.github.com>
committer: serhiy-storchaka <storchaka at gmail.com>
date: 2022-04-18T16:50:40+03:00
summary:

bpo-47256: Increasing the depth of backtracking in RE (GH-32411)

Limit the maximum capturing group to 2**30-1 on 64-bit platforms
(it was 2**31-1). No change on 32-bit platforms (2**28-1).

It allows to reduce the size of SRE(match_context):
- On 32 bit platform: 36 bytes, no change.  (msvc2022)
- On 64 bit platform: 72 bytes -> 56 bytes. (msvc2022/gcc9.4)

which leads to increasing the depth of backtracking.

files:
A Misc/NEWS.d/next/Library/2022-04-16-11-39-59.bpo-47256.1cygyd.rst
M Modules/_sre/sre.h
M Modules/_sre/sre_lib.h

diff --git a/Misc/NEWS.d/next/Library/2022-04-16-11-39-59.bpo-47256.1cygyd.rst b/Misc/NEWS.d/next/Library/2022-04-16-11-39-59.bpo-47256.1cygyd.rst
new file mode 100644
index 0000000000000..ac4c52bd7058a
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-04-16-11-39-59.bpo-47256.1cygyd.rst
@@ -0,0 +1,2 @@
+:mod:`re` module, limit the maximum capturing group to 1,073,741,823 in
+64-bit build, this increases the depth of backtracking.
diff --git a/Modules/_sre/sre.h b/Modules/_sre/sre.h
index 129f5595269f5..aff064d343ec4 100644
--- a/Modules/_sre/sre.h
+++ b/Modules/_sre/sre.h
@@ -18,10 +18,10 @@
 #define SRE_CODE Py_UCS4
 #if SIZEOF_SIZE_T > 4
 # define SRE_MAXREPEAT (~(SRE_CODE)0)
-# define SRE_MAXGROUPS ((~(SRE_CODE)0) / 2)
+# define SRE_MAXGROUPS ((SRE_CODE)INT32_MAX / 2)
 #else
 # define SRE_MAXREPEAT ((SRE_CODE)PY_SSIZE_T_MAX)
-# define SRE_MAXGROUPS ((SRE_CODE)PY_SSIZE_T_MAX / SIZEOF_SIZE_T / 2)
+# define SRE_MAXGROUPS ((SRE_CODE)PY_SSIZE_T_MAX / SIZEOF_VOID_P / 2)
 #endif
 
 typedef struct {
@@ -73,12 +73,12 @@ typedef struct {
     Py_ssize_t pos, endpos;
     int isbytes;
     int charsize; /* character size */
-    /* registers */
-    Py_ssize_t lastindex;
-    Py_ssize_t lastmark;
-    const void** mark;
     int match_all;
     int must_advance;
+    /* marks */
+    int lastmark;
+    int lastindex;
+    const void** mark;
     /* dynamically allocated stuff */
     char* data_stack;
     size_t data_stack_size;
diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h
index 3472e65b87ae6..db624aa896d6a 100644
--- a/Modules/_sre/sre_lib.h
+++ b/Modules/_sre/sre_lib.h
@@ -450,20 +450,23 @@ do { \
 
 #define MARK_PUSH(lastmark) \
     do if (lastmark >= 0) { \
-        i = lastmark; /* ctx->lastmark may change if reallocated */ \
-        DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
+        size_t _marks_size = (lastmark+1) * sizeof(void*); \
+        DATA_STACK_PUSH(state, state->mark, _marks_size); \
     } while (0)
 #define MARK_POP(lastmark) \
     do if (lastmark >= 0) { \
-        DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
+        size_t _marks_size = (lastmark+1) * sizeof(void*); \
+        DATA_STACK_POP(state, state->mark, _marks_size, 1); \
     } while (0)
 #define MARK_POP_KEEP(lastmark) \
     do if (lastmark >= 0) { \
-        DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
+        size_t _marks_size = (lastmark+1) * sizeof(void*); \
+        DATA_STACK_POP(state, state->mark, _marks_size, 0); \
     } while (0)
 #define MARK_POP_DISCARD(lastmark) \
     do if (lastmark >= 0) { \
-        DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
+        size_t _marks_size = (lastmark+1) * sizeof(void*); \
+        DATA_STACK_POP_DISCARD(state, _marks_size); \
     } while (0)
 
 #define JUMP_NONE            0
@@ -488,10 +491,10 @@ do { \
     ctx->pattern = pattern; \
     ctx->ptr = ptr; \
     DATA_ALLOC(SRE(match_context), nextctx); \
-    nextctx->last_ctx_pos = ctx_pos; \
-    nextctx->jump = jumpvalue; \
     nextctx->pattern = nextpattern; \
     nextctx->toplevel = toplevel_; \
+    nextctx->jump = jumpvalue; \
+    nextctx->last_ctx_pos = ctx_pos; \
     pattern = nextpattern; \
     ctx_pos = alloc_pos; \
     ctx = nextctx; \
@@ -507,18 +510,18 @@ do { \
     DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0)
 
 typedef struct {
-    Py_ssize_t last_ctx_pos;
-    Py_ssize_t jump;
-    const SRE_CHAR* ptr;
-    const SRE_CODE* pattern;
     Py_ssize_t count;
-    Py_ssize_t lastmark;
-    Py_ssize_t lastindex;
     union {
         SRE_CODE chr;
         SRE_REPEAT* rep;
     } u;
+    int lastmark;
+    int lastindex;
+    const SRE_CODE* pattern;
+    const SRE_CHAR* ptr;
     int toplevel;
+    int jump;
+    Py_ssize_t last_ctx_pos;
 } SRE(match_context);
 
 #define MAYBE_CHECK_SIGNALS                                        \
@@ -558,8 +561,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
 {
     const SRE_CHAR* end = (const SRE_CHAR *)state->end;
     Py_ssize_t alloc_pos, ctx_pos = -1;
-    Py_ssize_t i, ret = 0;
-    Py_ssize_t jump;
+    Py_ssize_t ret = 0;
+    int jump;
     unsigned int sigcount=0;
 
     SRE(match_context)* ctx;
@@ -607,20 +610,22 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
             /* <MARK> <gid> */
             TRACE(("|%p|%p|MARK %d\n", pattern,
                    ptr, pattern[0]));
-            i = pattern[0];
-            if (i & 1)
-                state->lastindex = i/2 + 1;
-            if (i > state->lastmark) {
-                /* state->lastmark is the highest valid index in the
-                   state->mark array.  If it is increased by more than 1,
-                   the intervening marks must be set to NULL to signal
-                   that these marks have not been encountered. */
-                Py_ssize_t j = state->lastmark + 1;
-                while (j < i)
-                    state->mark[j++] = NULL;
-                state->lastmark = i;
+            {
+                int i = pattern[0];
+                if (i & 1)
+                    state->lastindex = i/2 + 1;
+                if (i > state->lastmark) {
+                    /* state->lastmark is the highest valid index in the
+                       state->mark array.  If it is increased by more than 1,
+                       the intervening marks must be set to NULL to signal
+                       that these marks have not been encountered. */
+                    int j = state->lastmark + 1;
+                    while (j < i)
+                        state->mark[j++] = NULL;
+                    state->lastmark = i;
+                }
+                state->mark[i] = ptr;
             }
-            state->mark[i] = ptr;
             pattern++;
             DISPATCH;
 
@@ -1373,9 +1378,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
             /* match backreference */
             TRACE(("|%p|%p|GROUPREF %d\n", pattern,
                    ptr, pattern[0]));
-            i = pattern[0];
             {
-                Py_ssize_t groupref = i+i;
+                int groupref = pattern[0] * 2;
                 if (groupref >= state->lastmark) {
                     RETURN_FAILURE;
                 } else {
@@ -1398,9 +1402,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
             /* match backreference */
             TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern,
                    ptr, pattern[0]));
-            i = pattern[0];
             {
-                Py_ssize_t groupref = i+i;
+                int groupref = pattern[0] * 2;
                 if (groupref >= state->lastmark) {
                     RETURN_FAILURE;
                 } else {
@@ -1424,9 +1427,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
             /* match backreference */
             TRACE(("|%p|%p|GROUPREF_UNI_IGNORE %d\n", pattern,
                    ptr, pattern[0]));
-            i = pattern[0];
             {
-                Py_ssize_t groupref = i+i;
+                int groupref = pattern[0] * 2;
                 if (groupref >= state->lastmark) {
                     RETURN_FAILURE;
                 } else {
@@ -1450,9 +1452,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
             /* match backreference */
             TRACE(("|%p|%p|GROUPREF_LOC_IGNORE %d\n", pattern,
                    ptr, pattern[0]));
-            i = pattern[0];
             {
-                Py_ssize_t groupref = i+i;
+                int groupref = pattern[0] * 2;
                 if (groupref >= state->lastmark) {
                     RETURN_FAILURE;
                 } else {
@@ -1476,9 +1477,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
             TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", pattern,
                    ptr, pattern[0]));
             /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
-            i = pattern[0];
             {
-                Py_ssize_t groupref = i+i;
+                int groupref = pattern[0] * 2;
                 if (groupref >= state->lastmark) {
                     pattern += pattern[1];
                     DISPATCH;



More information about the Python-checkins mailing list