[Python-checkins] gh-91760: Deprecate group names and numbers which will be invalid in future (GH-91794)

Sat Apr 30 06:13:50 EDT 2022

https://github.com/python/cpython/commit/19dca041212f9f58ee11833bff3f8c157d4fd3e8
commit: 19dca041212f9f58ee11833bff3f8c157d4fd3e8
branch: main
author: Serhiy Storchaka <storchaka at gmail.com>
committer: serhiy-storchaka <storchaka at gmail.com>
date: 2022-04-30T13:13:46+03:00
summary:

gh-91760: Deprecate group names and numbers which will be invalid in future (GH-91794)

Only a sequence of ASCII digits will be accepted as a numerical reference.
Group names in bytes patterns and replacement strings will only be allowed
to contain ASCII letters, digits and underscores.

files:
A Misc/NEWS.d/next/Library/2022-04-21-19-46-03.gh-issue-91760.zDtv1E.rst
M Doc/library/re.rst
M Doc/whatsnew/3.11.rst
M Lib/re/_parser.py
M Lib/test/test_re.py

diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index 89de9286ace79..3cd9f252fee6f 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -417,6 +417,9 @@ The special characters are:
    |                                       | * ``\1``                         |
    +---------------------------------------+----------------------------------+
 
+   .. deprecated:: 3.11
+      Group names containing non-ASCII characters in bytes patterns.
+
 .. index:: single: (?P=; in regular expressions
 
 ``(?P=name)``
@@ -486,6 +489,9 @@ The special characters are:
    will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
    not with ``'<user@host.com'`` nor ``'user@host.com>'``.
 
+   .. deprecated:: 3.11
+      Group *id* containing anything except ASCII digits.
+
 
 The special sequences consist of ``'\'`` and a character from the list below.
 If the ordinary character is not an ASCII digit or an ASCII letter, then the
@@ -995,6 +1001,10 @@ form.
       Empty matches for the pattern are replaced when adjacent to a previous
       non-empty match.
 
+   .. deprecated:: 3.11
+      Group *id* containing anything except ASCII digits.
+      Group names containing non-ASCII characters in bytes replacement strings.
+
 
 .. function:: subn(pattern, repl, string, count=0, flags=0)
 
diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst
index fdd35ce14317e..1a692f2fe7f60 100644
--- a/Doc/whatsnew/3.11.rst
+++ b/Doc/whatsnew/3.11.rst
@@ -1151,6 +1151,14 @@ Deprecated
   (Contributed by Brett Cannon in :issue:`47061` and Victor Stinner in
   :gh:`68966`.)
 
+* Stricter rules will be applied to numerical group references and group
+  names in regular expressions in future Python versions.
+  Only a sequence of ASCII digits will be accepted as a numerical reference.
+  Group names in bytes patterns and replacement strings will only be
+  allowed to contain ASCII letters, digits and underscores.
+  For now, a deprecation warning is emitted for such syntax.
+  (Contributed by Serhiy Storchaka in :gh:`91760`.)
+
 
 Removed
 =======
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py
index 933d51589f46c..a393c508d86e5 100644
--- a/Lib/re/_parser.py
+++ b/Lib/re/_parser.py
@@ -287,8 +287,22 @@ def seek(self, index):
         self.__next()
 
     def error(self, msg, offset=0):
+        if not self.istext:
+            msg = msg.encode('ascii', 'backslashreplace').decode('ascii')
         return error(msg, self.string, self.tell() - offset)
 
+    def checkgroupname(self, name, offset, nested):
+        if not name.isidentifier():
+            msg = "bad character in group name %r" % name
+            raise self.error(msg, len(name) + offset)
+        if not (self.istext or name.isascii()):
+            import warnings
+            warnings.warn(
+                "bad character in group name %a at position %d" %
+                (name, self.tell() - len(name) - offset),
+                DeprecationWarning, stacklevel=nested + 7
+            )
+
 def _class_escape(source, escape):
     # handle escape code inside character class
     code = ESCAPES.get(escape)
@@ -703,15 +717,11 @@ def _parse(source, state, verbose, nested, first=False):
                     if sourcematch("<"):
                         # named group: skip forward to end of name
                         name = source.getuntil(">", "group name")
-                        if not name.isidentifier():
-                            msg = "bad character in group name %r" % name
-                            raise source.error(msg, len(name) + 1)
+                        source.checkgroupname(name, 1, nested)
                     elif sourcematch("="):
                         # named backreference
                         name = source.getuntil(")", "group name")
-                        if not name.isidentifier():
-                            msg = "bad character in group name %r" % name
-                            raise source.error(msg, len(name) + 1)
+                        source.checkgroupname(name, 1, nested)
                         gid = state.groupdict.get(name)
                         if gid is None:
                             msg = "unknown group name %r" % name
@@ -773,6 +783,7 @@ def _parse(source, state, verbose, nested, first=False):
                     # conditional backreference group
                     condname = source.getuntil(")", "group name")
                     if condname.isidentifier():
+                        source.checkgroupname(condname, 1, nested)
                         condgroup = state.groupdict.get(condname)
                         if condgroup is None:
                             msg = "unknown group name %r" % condname
@@ -795,6 +806,14 @@ def _parse(source, state, verbose, nested, first=False):
                             state.grouprefpos[condgroup] = (
                                 source.tell() - len(condname) - 1
                             )
+                        if not (condname.isdecimal() and condname.isascii()):
+                            import warnings
+                            warnings.warn(
+                                "bad character in group name %s at position %d" %
+                                (repr(condname) if source.istext else ascii(condname),
+                                 source.tell() - len(condname) - 1),
+                                DeprecationWarning, stacklevel=nested + 6
+                            )
                     state.checklookbehindgroup(condgroup, source)
                     item_yes = _parse(source, state, verbose, nested + 1)
                     if source.match("|"):
@@ -1000,11 +1019,11 @@ def addgroup(index, pos):
             # group
             c = this[1]
             if c == "g":
-                name = ""
                 if not s.match("<"):
                     raise s.error("missing <")
                 name = s.getuntil(">", "group name")
                 if name.isidentifier():
+                    s.checkgroupname(name, 1, -1)
                     try:
                         index = groupindex[name]
                     except KeyError:
@@ -1020,6 +1039,14 @@ def addgroup(index, pos):
                     if index >= MAXGROUPS:
                         raise s.error("invalid group reference %d" % index,
                                       len(name) + 1)
+                    if not (name.isdecimal() and name.isascii()):
+                        import warnings
+                        warnings.warn(
+                            "bad character in group name %s at position %d" %
+                            (repr(name) if s.istext else ascii(name),
+                             s.tell() - len(name) - 1),
+                            DeprecationWarning, stacklevel=5
+                        )
                 addgroup(index, len(name) + 1)
             elif c == "0":
                 if s.next in OCTDIGITS:
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index a4c2f1f3e4ba3..c1014753802c9 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -135,6 +135,7 @@ def test_basic_re_sub(self):
         self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
         self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
         self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')
+        self.assertEqual(re.sub('()x', r'\g<0>\g<0>', 'xx'), 'xxxx')
 
         self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
         self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
@@ -274,6 +275,21 @@ def test_symbolic_groups_errors(self):
         self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
         self.checkPatternError('(?P=©)', "bad character in group name '©'", 4)
         self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3)
+        with self.assertWarnsRegex(DeprecationWarning,
+                                   r"bad character in group name '\\xc2\\xb5' "
+                                   r"at position 4") as w:
+            re.compile(b'(?P<\xc2\xb5>x)')
+        self.assertEqual(w.filename, __file__)
+        with self.assertWarnsRegex(DeprecationWarning,
+                                   r"bad character in group name '\\xc2\\xb5' "
+                                   r"at position 4"):
+            self.checkPatternError(b'(?P=\xc2\xb5)',
+                                   r"unknown group name '\xc2\xb5'", 4)
+        with self.assertWarnsRegex(DeprecationWarning,
+                                   r"bad character in group name '\\xc2\\xb5' "
+                                   r"at position 3"):
+            self.checkPatternError(b'(?(\xc2\xb5)y)',
+                                   r"unknown group name '\xc2\xb5'", 3)
 
     def test_symbolic_refs(self):
         self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
@@ -306,12 +322,35 @@ def test_symbolic_refs_errors(self):
             re.sub('(?P<a>x)', r'\g<ab>', 'xx')
         self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
                                 "bad character in group name '-1'", 3)
+        with self.assertWarnsRegex(DeprecationWarning,
+                                   r"bad character in group name '\+1' "
+                                   r"at position 3") as w:
+            re.sub('(?P<a>x)', r'\g<+1>', 'xx')
+        self.assertEqual(w.filename, __file__)
+        with self.assertWarnsRegex(DeprecationWarning,
+                                   r"bad character in group name '1_0' "
+                                   r"at position 3"):
+            re.sub('()'*10, r'\g<1_0>', 'xx')
+        with self.assertWarnsRegex(DeprecationWarning,
+                                   r"bad character in group name ' 1 ' "
+                                   r"at position 3"):
+            re.sub('(?P<a>x)', r'\g< 1 >', 'xx')
         self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
                                 "bad character in group name '©'", 3)
+        with self.assertWarnsRegex(DeprecationWarning,
+                                   r"bad character in group name '\\xc2\\xb5' "
+                                   r"at position 3") as w:
+            with self.assertRaisesRegex(IndexError, "unknown group name '\xc2\xb5'"):
+                re.sub(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx')
+        self.assertEqual(w.filename, __file__)
         self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx',
                                 "bad character in group name '㊀'", 3)
         self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx',
                                 "bad character in group name '¹'", 3)
+        with self.assertWarnsRegex(DeprecationWarning,
+                                   r"bad character in group name '१' "
+                                   r"at position 3"):
+            re.sub('(?P<a>x)', r'\g<१>', 'xx')
 
     def test_re_subn(self):
         self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
@@ -577,10 +616,27 @@ def test_re_groupref_exists_errors(self):
         self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10)
         self.checkPatternError(r'()(?(-1)a|b)',
                                "bad character in group name '-1'", 5)
+        with self.assertWarnsRegex(DeprecationWarning,
+                                   r"bad character in group name '\+1' "
+                                   r"at position 5") as w:
+            re.compile(r'()(?(+1)a|b)')
+        self.assertEqual(w.filename, __file__)
+        with self.assertWarnsRegex(DeprecationWarning,
+                                   r"bad character in group name '1_0' "
+                                   r"at position 23"):
+            re.compile(r'()'*10 + r'(?(1_0)a|b)')
+        with self.assertWarnsRegex(DeprecationWarning,
+                                   r"bad character in group name ' 1 ' "
+                                   r"at position 5"):
+            re.compile(r'()(?( 1 )a|b)')
         self.checkPatternError(r'()(?(㊀)a|b)',
                                "bad character in group name '㊀'", 5)
         self.checkPatternError(r'()(?(¹)a|b)',
                                "bad character in group name '¹'", 5)
+        with self.assertWarnsRegex(DeprecationWarning,
+                                   r"bad character in group name '१' "
+                                   r"at position 5"):
+            re.compile(r'()(?(१)a|b)')
         self.checkPatternError(r'()(?(1',
                                "missing ), unterminated name", 5)
         self.checkPatternError(r'()(?(1)a',
diff --git a/Misc/NEWS.d/next/Library/2022-04-21-19-46-03.gh-issue-91760.zDtv1E.rst b/Misc/NEWS.d/next/Library/2022-04-21-19-46-03.gh-issue-91760.zDtv1E.rst
new file mode 100644
index 0000000000000..0bddbbe093144
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-04-21-19-46-03.gh-issue-91760.zDtv1E.rst
@@ -0,0 +1,4 @@
+Stricter rules will be applied to numerical group references and group
+names in regular expressions. For now, a deprecation warning is emitted for
+group references and group names which will be errors in future Python
+versions.