[Python-checkins] gh-81548: Deprecate octal escape sequences with value larger than 0o377 (GH-91668)

Sat Apr 30 06:16:35 EDT 2022

https://github.com/python/cpython/commit/3483299a24e41a7f2e958369cb3573d7c2253e33
commit: 3483299a24e41a7f2e958369cb3573d7c2253e33
branch: main
author: Serhiy Storchaka <storchaka at gmail.com>
committer: serhiy-storchaka <storchaka at gmail.com>
date: 2022-04-30T13:16:27+03:00
summary:

gh-81548: Deprecate octal escape sequences with value larger than 0o377 (GH-91668)

files:
A Misc/NEWS.d/next/Core and Builtins/2022-04-18-20-25-01.gh-issue-81548.n3VYgp.rst
M Doc/reference/lexical_analysis.rst
M Doc/whatsnew/3.11.rst
M Lib/test/test_codecs.py
M Lib/test/test_string_literals.py
M Objects/bytesobject.c
M Objects/unicodeobject.c
M Parser/string_parser.c

diff --git a/Doc/reference/lexical_analysis.rst b/Doc/reference/lexical_analysis.rst
index dba1a9dd2e15a..0e64a056a6eb2 100644
--- a/Doc/reference/lexical_analysis.rst
+++ b/Doc/reference/lexical_analysis.rst
@@ -596,6 +596,11 @@ Notes:
 (1)
    As in Standard C, up to three octal digits are accepted.
 
+   .. versionchanged:: 3.11
+      Octal escapes with a value larger than ``0o377`` produce a :exc:`DeprecationWarning`.
+      In a future Python version they will produce a :exc:`SyntaxWarning` and
+      eventually raise a :exc:`SyntaxError`.
+
 (2)
    Unlike in Standard C, exactly two hex digits are required.
 
diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst
index 1a692f2fe7f60..59c8dd6e44732 100644
--- a/Doc/whatsnew/3.11.rst
+++ b/Doc/whatsnew/3.11.rst
@@ -1055,6 +1055,12 @@ CPython bytecode changes
 Deprecated
 ==========
 
+* Octal escapes with a value larger than ``0o377`` now produce
+  a :exc:`DeprecationWarning`.
+  In a future Python version they will produce a :exc:`SyntaxWarning` and
+  eventually raise a :exc:`SyntaxError`.
+  (Contributed by Serhiy Storchaka in :issue:`81548`.)
+
 * The :mod:`lib2to3` package and ``2to3`` tool are now deprecated and may not
   be able to parse Python 3.10 or newer. See the :pep:`617` (New PEG parser for
   CPython).  (Contributed by Victor Stinner in :issue:`40360`.)
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 5853e08819715..42c600dcb00de 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1193,7 +1193,6 @@ def test_escape(self):
         check(br"[\418]", b"[!8]")
         check(br"[\101]", b"[A]")
         check(br"[\1010]", b"[A0]")
-        check(br"[\501]", b"[A]")
         check(br"[\x41]", b"[A]")
         check(br"[\x410]", b"[A0]")
         for i in range(97, 123):
@@ -1209,6 +1208,9 @@ def test_escape(self):
             check(br"\9", b"\\9")
         with self.assertWarns(DeprecationWarning):
             check(b"\\\xfa", b"\\\xfa")
+        for i in range(0o400, 0o1000):
+            with self.assertWarns(DeprecationWarning):
+                check(rb'\%o' % i, bytes([i & 0o377]))
 
     def test_errors(self):
         decode = codecs.escape_decode
@@ -2435,6 +2437,9 @@ def test_escape_decode(self):
             check(br"\9", "\\9")
         with self.assertWarns(DeprecationWarning):
             check(b"\\\xfa", "\\\xfa")
+        for i in range(0o400, 0o1000):
+            with self.assertWarns(DeprecationWarning):
+                check(rb'\%o' % i, chr(i))
 
     def test_decode_errors(self):
         decode = codecs.unicode_escape_decode
diff --git a/Lib/test/test_string_literals.py b/Lib/test/test_string_literals.py
index 7231970acf19d..3a3830bcb6e96 100644
--- a/Lib/test/test_string_literals.py
+++ b/Lib/test/test_string_literals.py
@@ -116,6 +116,7 @@ def test_eval_str_invalid_escape(self):
             warnings.simplefilter('always', category=DeprecationWarning)
             eval("'''\n\\z'''")
         self.assertEqual(len(w), 1)
+        self.assertEqual(str(w[0].message), r"invalid escape sequence '\z'")
         self.assertEqual(w[0].filename, '<string>')
         self.assertEqual(w[0].lineno, 1)
 
@@ -125,6 +126,32 @@ def test_eval_str_invalid_escape(self):
                 eval("'''\n\\z'''")
             exc = cm.exception
         self.assertEqual(w, [])
+        self.assertEqual(exc.msg, r"invalid escape sequence '\z'")
+        self.assertEqual(exc.filename, '<string>')
+        self.assertEqual(exc.lineno, 1)
+        self.assertEqual(exc.offset, 1)
+
+    def test_eval_str_invalid_octal_escape(self):
+        for i in range(0o400, 0o1000):
+            with self.assertWarns(DeprecationWarning):
+                self.assertEqual(eval(r"'\%o'" % i), chr(i))
+
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always', category=DeprecationWarning)
+            eval("'''\n\\407'''")
+        self.assertEqual(len(w), 1)
+        self.assertEqual(str(w[0].message),
+                         r"invalid octal escape sequence '\407'")
+        self.assertEqual(w[0].filename, '<string>')
+        self.assertEqual(w[0].lineno, 1)
+
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('error', category=DeprecationWarning)
+            with self.assertRaises(SyntaxError) as cm:
+                eval("'''\n\\407'''")
+            exc = cm.exception
+        self.assertEqual(w, [])
+        self.assertEqual(exc.msg, r"invalid octal escape sequence '\407'")
         self.assertEqual(exc.filename, '<string>')
         self.assertEqual(exc.lineno, 1)
         self.assertEqual(exc.offset, 1)
@@ -166,6 +193,7 @@ def test_eval_bytes_invalid_escape(self):
             warnings.simplefilter('always', category=DeprecationWarning)
             eval("b'''\n\\z'''")
         self.assertEqual(len(w), 1)
+        self.assertEqual(str(w[0].message), r"invalid escape sequence '\z'")
         self.assertEqual(w[0].filename, '<string>')
         self.assertEqual(w[0].lineno, 1)
 
@@ -175,6 +203,31 @@ def test_eval_bytes_invalid_escape(self):
                 eval("b'''\n\\z'''")
             exc = cm.exception
         self.assertEqual(w, [])
+        self.assertEqual(exc.msg, r"invalid escape sequence '\z'")
+        self.assertEqual(exc.filename, '<string>')
+        self.assertEqual(exc.lineno, 1)
+
+    def test_eval_bytes_invalid_octal_escape(self):
+        for i in range(0o400, 0o1000):
+            with self.assertWarns(DeprecationWarning):
+                self.assertEqual(eval(r"b'\%o'" % i), bytes([i & 0o377]))
+
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always', category=DeprecationWarning)
+            eval("b'''\n\\407'''")
+        self.assertEqual(len(w), 1)
+        self.assertEqual(str(w[0].message),
+                         r"invalid octal escape sequence '\407'")
+        self.assertEqual(w[0].filename, '<string>')
+        self.assertEqual(w[0].lineno, 1)
+
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('error', category=DeprecationWarning)
+            with self.assertRaises(SyntaxError) as cm:
+                eval("b'''\n\\407'''")
+            exc = cm.exception
+        self.assertEqual(w, [])
+        self.assertEqual(exc.msg, r"invalid octal escape sequence '\407'")
         self.assertEqual(exc.filename, '<string>')
         self.assertEqual(exc.lineno, 1)
 
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-04-18-20-25-01.gh-issue-81548.n3VYgp.rst b/Misc/NEWS.d/next/Core and Builtins/2022-04-18-20-25-01.gh-issue-81548.n3VYgp.rst
new file mode 100644
index 0000000000000..56b1fd63b71c0
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-04-18-20-25-01.gh-issue-81548.n3VYgp.rst	
@@ -0,0 +1,3 @@
+Octal escapes with a value larger than ``0o377`` now produce a
+:exc:`DeprecationWarning`. In a future Python version they will produce a
+:exc:`SyntaxWarning` and eventually raise a :exc:`SyntaxError`.
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
index 510a836e6b152..b5066d0986c7b 100644
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@@ -1113,6 +1113,12 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
                 if (s < end && '0' <= *s && *s <= '7')
                     c = (c<<3) + *s++ - '0';
             }
+            if (c > 0377) {
+                if (*first_invalid_escape == NULL) {
+                    *first_invalid_escape = s-3; /* Back up 3 chars, since we've
+                                                    already incremented s. */
+                }
+            }
             *p++ = c;
             break;
         case 'x':
@@ -1179,11 +1185,24 @@ PyObject *PyBytes_DecodeEscape(const char *s,
     if (result == NULL)
         return NULL;
     if (first_invalid_escape != NULL) {
-        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
-                             "invalid escape sequence '\\%c'",
-                             (unsigned char)*first_invalid_escape) < 0) {
-            Py_DECREF(result);
-            return NULL;
+        unsigned char c = *first_invalid_escape;
+        if ('4' <= c && c <= '7') {
+            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                                 "invalid octal escape sequence '\\%.3s'",
+                                 first_invalid_escape) < 0)
+            {
+                Py_DECREF(result);
+                return NULL;
+            }
+        }
+        else {
+            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                                 "invalid escape sequence '\\%c'",
+                                 c) < 0)
+            {
+                Py_DECREF(result);
+                return NULL;
+            }
         }
     }
     return result;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 7768f66c95655..493307556529b 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6404,6 +6404,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
                     ch = (ch<<3) + *s++ - '0';
                 }
             }
+            if (ch > 0377) {
+                if (*first_invalid_escape == NULL) {
+                    *first_invalid_escape = s-3; /* Back up 3 chars, since we've
+                                                    already incremented s. */
+                }
+            }
             WRITE_CHAR(ch);
             continue;
 
@@ -6554,11 +6560,24 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
     if (result == NULL)
         return NULL;
     if (first_invalid_escape != NULL) {
-        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
-                             "invalid escape sequence '\\%c'",
-                             (unsigned char)*first_invalid_escape) < 0) {
-            Py_DECREF(result);
-            return NULL;
+        unsigned char c = *first_invalid_escape;
+        if ('4' <= c && c <= '7') {
+            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                                 "invalid octal escape sequence '\\%.3s'",
+                                 first_invalid_escape) < 0)
+            {
+                Py_DECREF(result);
+                return NULL;
+            }
+        }
+        else {
+            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                                 "invalid escape sequence '\\%c'",
+                                 c) < 0)
+            {
+                Py_DECREF(result);
+                return NULL;
+            }
         }
     }
     return result;
diff --git a/Parser/string_parser.c b/Parser/string_parser.c
index 65ddd46874b20..9c12d8ca101d0 100644
--- a/Parser/string_parser.c
+++ b/Parser/string_parser.c
@@ -9,10 +9,15 @@
 //// STRING HANDLING FUNCTIONS ////
 
 static int
-warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
+warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
 {
+    unsigned char c = *first_invalid_escape;
+    int octal = ('4' <= c && c <= '7');
     PyObject *msg =
-        PyUnicode_FromFormat("invalid escape sequence '\\%c'", first_invalid_escape_char);
+        octal
+        ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'",
+                               first_invalid_escape)
+        : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c);
     if (msg == NULL) {
         return -1;
     }
@@ -27,7 +32,13 @@ warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char,
                since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
                error location, if p->known_err_token is not set. */
             p->known_err_token = t;
-            RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", first_invalid_escape_char);
+            if (octal) {
+                RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
+                                   first_invalid_escape);
+            }
+            else {
+                RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
+            }
         }
         Py_DECREF(msg);
         return -1;
@@ -118,7 +129,7 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
     v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
 
     if (v != NULL && first_invalid_escape != NULL) {
-        if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
+        if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
             /* We have not decref u before because first_invalid_escape points
                inside u. */
             Py_XDECREF(u);
@@ -140,7 +151,7 @@ decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
     }
 
     if (first_invalid_escape != NULL) {
-        if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
+        if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
             Py_DECREF(result);
             return NULL;
         }
@@ -357,7 +368,7 @@ fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
             break;
         }
     }
-    
+
     if (s == expr_end) {
         if (*expr_end == '!' || *expr_end == ':' || *expr_end == '=') {
             RAISE_SYNTAX_ERROR("f-string: expression required before '%c'", *expr_end);
@@ -465,7 +476,7 @@ fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
                    decode_unicode_with_escapes(). */
                 continue;
             }
-            if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
+            if (ch == '{' && warn_invalid_escape_sequence(p, s-1, t) < 0) {
                 return -1;
             }
         }