[Python-checkins] bpo-12486: Document tokenize.generate_tokens() as public API (#6957)
Carol Willing
webhook-mailer at python.org
Tue Jun 5 13:26:42 EDT 2018
https://github.com/python/cpython/commit/c56b17bd8c7a3fd03859822246633d2c9586f8bd
commit: c56b17bd8c7a3fd03859822246633d2c9586f8bd
branch: master
author: Thomas Kluyver <takowl at gmail.com>
committer: Carol Willing <carolcode at willingconsulting.com>
date: 2018-06-05T10:26:39-07:00
summary:
bpo-12486: Document tokenize.generate_tokens() as public API (#6957)
* Document tokenize.generate_tokens()
* Add news file
* Add test for generate_tokens
* Document behaviour around ENCODING token
* Add generate_tokens to __all__
files:
A Misc/NEWS.d/next/Library/2018-05-17-22-14-58.bpo-12486.HBeh62.rst
M Doc/library/tokenize.rst
M Lib/test/test_tokenize.py
M Lib/tokenize.py
diff --git a/Doc/library/tokenize.rst b/Doc/library/tokenize.rst
index 4c0a0ceef7dc..111289c767f3 100644
--- a/Doc/library/tokenize.rst
+++ b/Doc/library/tokenize.rst
@@ -57,6 +57,16 @@ The primary entry point is a :term:`generator`:
:func:`.tokenize` determines the source encoding of the file by looking for a
UTF-8 BOM or encoding cookie, according to :pep:`263`.
+.. function:: generate_tokens(readline)
+
+ Tokenize a source reading unicode strings instead of bytes.
+
+ Like :func:`.tokenize`, the *readline* argument is a callable returning
+ a single line of input. However, :func:`generate_tokens` expects *readline*
+ to return a str object rather than bytes.
+
+ The result is an iterator yielding named tuples, exactly like
+ :func:`.tokenize`. It does not yield an :data:`~token.ENCODING` token.
All constants from the :mod:`token` module are also exported from
:mod:`tokenize`.
@@ -79,7 +89,8 @@ write back the modified script.
positions) may change.
It returns bytes, encoded using the :data:`~token.ENCODING` token, which
- is the first token sequence output by :func:`.tokenize`.
+ is the first token sequence output by :func:`.tokenize`. If there is no
+ encoding token in the input, it returns a str instead.
:func:`.tokenize` needs to detect the encoding of source files it tokenizes. The
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 3520a67bd42b..93e40de96e9e 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,8 +1,8 @@
from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
- open as tokenize_open, Untokenizer)
-from io import BytesIO
+ open as tokenize_open, Untokenizer, generate_tokens)
+from io import BytesIO, StringIO
import unittest
from unittest import TestCase, mock
from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
@@ -919,6 +919,19 @@ def baz(): pass
DEDENT '' (7, 0) (7, 0)
""")
+class GenerateTokensTest(TokenizeTest):
+ def check_tokenize(self, s, expected):
+ # Format the tokens in s in a table format.
+ # The ENDMARKER is omitted.
+ result = []
+ f = StringIO(s)
+ for type, token, start, end, line in generate_tokens(f.readline):
+ if type == ENDMARKER:
+ break
+ type = tok_name[type]
+ result.append(f" {type:10} {token!r:13} {start} {end}")
+ self.assertEqual(result, expected.rstrip().splitlines())
+
def decistmt(s):
result = []
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 40e6a8b9297b..c78d9f7e9ee5 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -37,7 +37,7 @@
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
import token
-__all__ = token.__all__ + ["tokenize", "detect_encoding",
+__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
"untokenize", "TokenInfo"]
del token
@@ -653,9 +653,12 @@ def _tokenize(readline, encoding):
yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
-# An undocumented, backwards compatible, API for all the places in the standard
-# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
+ """Tokenize a source reading Python code as unicode strings.
+
+ This has the same API as tokenize(), except that it expects the *readline*
+ callable to return str objects instead of bytes.
+ """
return _tokenize(readline, None)
def main():
diff --git a/Misc/NEWS.d/next/Library/2018-05-17-22-14-58.bpo-12486.HBeh62.rst b/Misc/NEWS.d/next/Library/2018-05-17-22-14-58.bpo-12486.HBeh62.rst
new file mode 100644
index 000000000000..89c88e27373b
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-05-17-22-14-58.bpo-12486.HBeh62.rst
@@ -0,0 +1,2 @@
+:func:`tokenize.generate_tokens` is now documented as a public API to
+tokenize unicode strings. It was previously present but undocumented.
More information about the Python-checkins
mailing list