[Python-checkins] cpython: Issue #23622: Unknown escapes in regular expressions that consist of ``'\'``

Tue Mar 24 21:58:56 CET 2015

https://hg.python.org/cpython/rev/014031a4d398
changeset:   95181:014031a4d398
user:        Serhiy Storchaka <storchaka at gmail.com>
date:        Tue Mar 24 22:58:14 2015 +0200
summary:
  Issue #23622: Unknown escapes in regular expressions that consist of ``'\'``
and an ASCII letter now raise a deprecation warning and will be forbidden in
Python 3.6.

files:
  Doc/howto/regex.rst  |   2 +-
  Doc/library/re.rst   |  10 ++++++-
  Lib/sre_parse.py     |  14 +++++++++-
  Lib/test/re_tests.py |   6 ++--
  Lib/test/test_re.py  |  43 ++++++++++++++++++++++---------
  Misc/NEWS            |   4 ++
  6 files changed, 60 insertions(+), 19 deletions(-)

diff --git a/Doc/howto/regex.rst b/Doc/howto/regex.rst
--- a/Doc/howto/regex.rst
+++ b/Doc/howto/regex.rst
@@ -1138,7 +1138,7 @@
 
 If *replacement* is a string, any backslash escapes in it are processed.  That
 is, ``\n`` is converted to a single newline character, ``\r`` is converted to a
-carriage return, and so forth. Unknown escapes such as ``\j`` are left alone.
+carriage return, and so forth. Unknown escapes such as ``\&`` are left alone.
 Backreferences, such as ``\6``, are replaced with the substring matched by the
 corresponding group in the RE.  This lets you incorporate portions of the
 original text in the resulting replacement string.
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -438,6 +438,10 @@
 .. versionchanged:: 3.3
    The ``'\u'`` and ``'\U'`` escape sequences have been added.
 
+.. deprecated-removed:: 3.5 3.6
+   Unknown escapes consisting of ``'\'`` and an ASCII letter now raise a
+   deprecation warning and will be forbidden in Python 3.6.
+
 
 .. seealso::
 
@@ -687,7 +691,7 @@
    *string* is returned unchanged.  *repl* can be a string or a function; if it is
    a string, any backslash escapes in it are processed.  That is, ``\n`` is
    converted to a single newline character, ``\r`` is converted to a carriage return, and
-   so forth.  Unknown escapes such as ``\j`` are left alone.  Backreferences, such
+   so forth.  Unknown escapes such as ``\&`` are left alone.  Backreferences, such
    as ``\6``, are replaced with the substring matched by group 6 in the pattern.
    For example:
 
@@ -732,6 +736,10 @@
    .. versionchanged:: 3.5
       Unmatched groups are replaced with an empty string.
 
+   .. deprecated-removed:: 3.5 3.6
+      Unknown escapes consisting of ``'\'`` and an ASCII letter now raise a
+      deprecation warning and will be forbidden in Python 3.6.
+
 
 .. function:: subn(pattern, repl, string, count=0, flags=0)
 
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -21,6 +21,7 @@
 
 OCTDIGITS = frozenset("01234567")
 HEXDIGITS = frozenset("0123456789abcdefABCDEF")
+ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
 
 WHITESPACE = frozenset(" \t\n\r\v\f")
 
@@ -344,6 +345,10 @@
         elif c in DIGITS:
             raise ValueError
         if len(escape) == 2:
+            if c in ASCIILETTERS:
+                import warnings
+                warnings.warn('bad escape %s' % escape,
+                              DeprecationWarning, stacklevel=8)
             return LITERAL, ord(escape[1])
     except ValueError:
         pass
@@ -407,6 +412,10 @@
                 return GROUPREF, group
             raise ValueError
         if len(escape) == 2:
+            if c in ASCIILETTERS:
+                import warnings
+                warnings.warn('bad escape %s' % escape,
+                              DeprecationWarning, stacklevel=8)
             return LITERAL, ord(escape[1])
     except ValueError:
         pass
@@ -903,7 +912,10 @@
                 try:
                     this = chr(ESCAPES[this][1])
                 except KeyError:
-                    pass
+                    if c in ASCIILETTERS:
+                        import warnings
+                        warnings.warn('bad escape %s' % this,
+                                      DeprecationWarning, stacklevel=5)
                 lappend(this)
         else:
             lappend(this)
diff --git a/Lib/test/re_tests.py b/Lib/test/re_tests.py
--- a/Lib/test/re_tests.py
+++ b/Lib/test/re_tests.py
@@ -87,7 +87,7 @@
     (r'[\a][\b][\f][\n][\r][\t][\v]', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'),
     # NOTE: not an error under PCRE/PRE:
     (r'\u', '', SYNTAX_ERROR),    # A Perl escape
-    (r'\c\e\g\h\i\j\k\m\o\p\q\y\z', 'ceghijkmopqyz', SUCCEED, 'found', 'ceghijkmopqyz'),
+    # (r'\c\e\g\h\i\j\k\m\o\p\q\y\z', 'ceghijkmopqyz', SUCCEED, 'found', 'ceghijkmopqyz'),
     (r'\xff', '\377', SUCCEED, 'found', chr(255)),
     # new \x semantics
     (r'\x00ffffffffffffff', '\377', FAIL, 'found', chr(255)),
@@ -607,8 +607,8 @@
     # new \x semantics
     (r'\x00ff', '\377', FAIL),
     # (r'\x00ff', '\377', SUCCEED, 'found', chr(255)),
-    (r'\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'),
-    ('\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'),
+    (r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', '\t\n\v\r\f\a'),
+    ('\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', '\t\n\v\r\f\a'),
     (r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)),
     (r'[\t][\n][\v][\r][\f][\b]', '\t\n\v\r\f\b', SUCCEED, 'found', '\t\n\v\r\f\b'),
 
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -100,11 +100,14 @@
         self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
         self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
 
-        self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
-                         '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
-        self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
-        self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
-                         (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))
+        self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
+        self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
+        self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
+                         (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
+        for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
+            with self.subTest(c):
+                with self.assertWarns(DeprecationWarning):
+                    self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)
 
         self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
 
@@ -551,14 +554,23 @@
         self.assertEqual(re.match(r"\(", '(').group(), '(')
         self.assertIsNone(re.match(r"\(", ')'))
         self.assertEqual(re.match(r"\\", '\\').group(), '\\')
-        self.assertEqual(re.match(r"\y", 'y').group(), 'y')
-        self.assertIsNone(re.match(r"\y", 'z'))
         self.assertEqual(re.match(r"[\]]", ']').group(), ']')
         self.assertIsNone(re.match(r"[\]]", '['))
         self.assertEqual(re.match(r"[a\-c]", '-').group(), '-')
         self.assertIsNone(re.match(r"[a\-c]", 'b'))
         self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
         self.assertIsNone(re.match(r"[\^a]+", 'b'))
+        re.purge()  # for warnings
+        for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
+            with self.subTest(c):
+                with self.assertWarns(DeprecationWarning):
+                    self.assertEqual(re.fullmatch('\\%c' % c, c).group(), c)
+                    self.assertIsNone(re.match('\\%c' % c, 'a'))
+        for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
+            with self.subTest(c):
+                with self.assertWarns(DeprecationWarning):
+                    self.assertEqual(re.fullmatch('[\\%c]' % c, c).group(), c)
+                    self.assertIsNone(re.match('[\\%c]' % c, 'a'))
 
     def test_string_boundaries(self):
         # See http://bugs.python.org/issue10713
@@ -907,8 +919,10 @@
             self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
             self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
             self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
-        self.assertTrue(re.match(br"\u", b'u'))
-        self.assertTrue(re.match(br"\U", b'U'))
+        with self.assertWarns(DeprecationWarning):
+            self.assertTrue(re.match(br"\u1234", b'u1234'))
+        with self.assertWarns(DeprecationWarning):
+            self.assertTrue(re.match(br"\U00012345", b'U00012345'))
         self.assertTrue(re.match(br"\0", b"\000"))
         self.assertTrue(re.match(br"\08", b"\0008"))
         self.assertTrue(re.match(br"\01", b"\001"))
@@ -928,8 +942,10 @@
             self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
             self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
             self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
-        self.assertTrue(re.match(br"[\u]", b'u'))
-        self.assertTrue(re.match(br"[\U]", b'U'))
+        with self.assertWarns(DeprecationWarning):
+            self.assertTrue(re.match(br"[\u1234]", b'u'))
+        with self.assertWarns(DeprecationWarning):
+            self.assertTrue(re.match(br"[\U00012345]", b'U'))
         self.assertRaises(re.error, re.match, br"[\567]", b"")
         self.assertRaises(re.error, re.match, br"[\911]", b"")
         self.assertRaises(re.error, re.match, br"[\x1z]", b"")
@@ -1304,8 +1320,9 @@
     def test_bug_13899(self):
         # Issue #13899: re pattern r"[\A]" should work like "A" but matches
         # nothing. Ditto B and Z.
-        self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
-                         ['A', 'B', '\b', 'C', 'Z'])
+        with self.assertWarns(DeprecationWarning):
+            self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
+                             ['A', 'B', '\b', 'C', 'Z'])
 
     @bigmemtest(size=_2G, memuse=1)
     def test_large_search(self, size):
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -30,6 +30,10 @@
 Library
 -------
 
+- Issue #23622: Unknown escapes in regular expressions that consist of ``'\'``
+  and an ASCII letter now raise a deprecation warning and will be forbidden in
+  Python 3.6.
+
 - Issue #23671: string.Template now allows to specify the "self" parameter as
   keyword argument.  string.Formatter now allows to specify the "self" and
   the "format_string" parameters as keyword arguments.

-- 
Repository URL: https://hg.python.org/cpython