[Python-checkins] [3.12] gh-105564: Don't include artificial newlines in the line attribute of tokens (GH-105565) (#105579)

pablogsal webhook-mailer at python.org
Fri Jun 9 12:58:22 EDT 2023


https://github.com/python/cpython/commit/16b1cdc87c08c01294b66257a26574725b005c50
commit: 16b1cdc87c08c01294b66257a26574725b005c50
branch: 3.12
author: Miss Islington (bot) <31488909+miss-islington at users.noreply.github.com>
committer: pablogsal <Pablogsal at gmail.com>
date: 2023-06-09T16:58:14Z
summary:

[3.12] gh-105564: Don't include artificial newlines in the line attribute of tokens (GH-105565) (#105579)

Co-authored-by: Pablo Galindo Salgado <Pablogsal at gmail.com>

files:
A Misc/NEWS.d/next/Core and Builtins/2023-06-09-15-25-12.gh-issue-105564.sFdUu4.rst
M Lib/test/test_tokenize.py
M Python/Python-tokenize.c

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 6747b0d8f65a1..2c124f062e7fd 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1229,7 +1229,7 @@ def readline():
         # skip the initial encoding token and the end tokens
         tokens = list(_generate_tokens_from_c_tokenizer(readline().__next__, encoding='utf-8',
                       extra_tokens=True))[:-2]
-        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
+        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
 
@@ -1638,8 +1638,8 @@ def test_comment_at_the_end_of_the_source_without_newline(self):
             TokenInfo(type=token.NUMBER, string='1', start=(1, 4), end=(1, 5), line='b = 1\n'),
             TokenInfo(type=token.NEWLINE, string='\n', start=(1, 5), end=(1, 6), line='b = 1\n'),
             TokenInfo(type=token.NL, string='\n', start=(2, 0), end=(2, 1), line='\n'),
-            TokenInfo(type=token.COMMENT, string='#test', start=(3, 0), end=(3, 5), line='#test\n'),
-            TokenInfo(type=token.NL, string='', start=(3, 5), end=(3, 6), line='#test\n'),
+            TokenInfo(type=token.COMMENT, string='#test', start=(3, 0), end=(3, 5), line='#test'),
+            TokenInfo(type=token.NL, string='', start=(3, 5), end=(3, 6), line='#test'),
             TokenInfo(type=token.ENDMARKER, string='', start=(4, 0), end=(4, 0), line='')
         ]
 
@@ -1653,7 +1653,7 @@ def test_newline_and_space_at_the_end_of_the_source_without_newline(self):
             TokenInfo(token.ENCODING, string='utf-8', start=(0, 0), end=(0, 0), line=''),
             TokenInfo(token.NAME, string='a', start=(1, 0), end=(1, 1), line='a\n'),
             TokenInfo(token.NEWLINE, string='\n', start=(1, 1), end=(1, 2), line='a\n'),
-            TokenInfo(token.NL, string='', start=(2, 1), end=(2, 2), line=' \n'),
+            TokenInfo(token.NL, string='', start=(2, 1), end=(2, 2), line=' '),
             TokenInfo(token.ENDMARKER, string='', start=(3, 0), end=(3, 0), line='')
         ]
 
@@ -1889,10 +1889,10 @@ def readline(encoding):
             yield "1+1".encode(encoding)
 
         expected = [
-            TokenInfo(type=NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1\n'),
-            TokenInfo(type=OP, string='+', start=(1, 1), end=(1, 2), line='1+1\n'),
-            TokenInfo(type=NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1\n'),
-            TokenInfo(type=NEWLINE, string='', start=(1, 3), end=(1, 4), line='1+1\n'),
+            TokenInfo(type=NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1'),
+            TokenInfo(type=OP, string='+', start=(1, 1), end=(1, 2), line='1+1'),
+            TokenInfo(type=NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1'),
+            TokenInfo(type=NEWLINE, string='', start=(1, 3), end=(1, 4), line='1+1'),
             TokenInfo(type=ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
         ]
         for encoding in ["utf-8", "latin-1", "utf-16"]:
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-06-09-15-25-12.gh-issue-105564.sFdUu4.rst b/Misc/NEWS.d/next/Core and Builtins/2023-06-09-15-25-12.gh-issue-105564.sFdUu4.rst
new file mode 100644
index 0000000000000..9809fac49164f
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-06-09-15-25-12.gh-issue-105564.sFdUu4.rst	
@@ -0,0 +1,2 @@
+Don't include artificial newlines in the ``line`` attribute of tokens in the
+APIs of the :mod:`tokenize` module. Patch by Pablo Galindo
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 2cf052a0cdeb3..1938562706914 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -206,6 +206,9 @@ tokenizeriter_next(tokenizeriterobject *it)
         line = PyUnicode_FromString("");
     } else {
         Py_ssize_t size = it->tok->inp - line_start;
+        if (size >= 1 && it->tok->implicit_newline) {
+            size -= 1;
+        }
         line = PyUnicode_DecodeUTF8(line_start, size, "replace");
     }
     if (line == NULL) {



More information about the Python-checkins mailing list