[pypy-commit] pypy py3.5-fstring-pep498: This version sticks to CPython more closely
arigo
pypy.commits at gmail.com
Tue Jan 24 04:42:31 EST 2017
Author: Armin Rigo <arigo at tunes.org>
Branch: py3.5-fstring-pep498
Changeset: r89722:d4206d15e05e
Date: 2017-01-24 10:41 +0100
http://bitbucket.org/pypy/pypy/changeset/d4206d15e05e/
Log: This version sticks to CPython more closely
diff --git a/pypy/interpreter/astcompiler/consts.py b/pypy/interpreter/astcompiler/consts.py
--- a/pypy/interpreter/astcompiler/consts.py
+++ b/pypy/interpreter/astcompiler/consts.py
@@ -33,7 +33,6 @@
PyCF_IGNORE_COOKIE = 0x0800
PyCF_ACCEPT_NULL_BYTES = 0x10000000 # PyPy only, for compile()
PyCF_FOUND_ENCODING = 0x20000000 # PyPy only, for pytokenizer
-PyCF_REFUSE_COMMENTS = 0x40000000 # PyPy only, for f-strings
# Masks and values used by FORMAT_VALUE opcode
FVC_MASK = 0x3
diff --git a/pypy/interpreter/astcompiler/fstring.py b/pypy/interpreter/astcompiler/fstring.py
--- a/pypy/interpreter/astcompiler/fstring.py
+++ b/pypy/interpreter/astcompiler/fstring.py
@@ -1,6 +1,8 @@
from pypy.interpreter.astcompiler import ast, consts
from pypy.interpreter.pyparser import parsestring
from pypy.interpreter import error
+from pypy.interpreter import unicodehelper
+from rpython.rlib.rstring import UnicodeBuilder
def add_constant_string(astbuilder, joined_pieces, w_string, atom_node):
@@ -46,96 +48,261 @@
info = pyparse.CompileInfo("<fstring>", "eval",
consts.PyCF_SOURCE_IS_UTF8 |
- consts.PyCF_IGNORE_COOKIE |
- consts.PyCF_REFUSE_COMMENTS,
+ consts.PyCF_IGNORE_COOKIE,
optimize=astbuilder.compile_info.optimize)
parse_tree = astbuilder.recursive_parser.parse_source(source, info)
return ast_from_node(astbuilder.space, parse_tree, info)
-def f_string_expr(astbuilder, joined_pieces, u, start, atom_node, rec=0):
- conversion = -1 # the conversion char. -1 if not specified.
+
+def unexpected_end_of_string(astbuilder, atom_node):
+ astbuilder.error("f-string: expecting '}'", atom_node)
+
+
+def fstring_find_expr(astbuilder, fstr, atom_node, rec):
+ # Parse the f-string at fstr.current_index. We know it starts an
+ # expression (so it must be at '{'). Returns the FormattedValue node,
+ # which includes the expression, conversion character, and
+ # format_spec expression.
+ conversion = -1 # the conversion char. -1 if not specified.
format_spec = None
- nested_depth = 0 # nesting level for braces/parens/brackets in exprs
- p = start
- while p < len(u):
- ch = u[p]
- p += 1
- if ch in u'[{(':
+
+ # 0 if we're not in a string, else the quote char we're trying to
+ # match (single or double quote).
+ quote_char = 0
+
+ # If we're inside a string, 1=normal, 3=triple-quoted.
+ string_type = 0
+
+ # Keep track of nesting level for braces/parens/brackets in
+ # expressions.
+ nested_depth = 0
+
+ # Can only nest one level deep.
+ if rec >= 2:
+ astbuilder.error("f-string: expressions nested too deeply", atom_node)
+
+ # The first char must be a left brace, or we wouldn't have gotten
+ # here. Skip over it.
+ u = fstr.unparsed
+ i = fstr.current_index
+ assert u[i] == u'{'
+ i += 1
+
+ expr_start = i
+ while i < len(u):
+
+ # Loop invariants.
+ assert nested_depth >= 0
+ if quote_char:
+ assert string_type == 1 or string_type == 3
+ else:
+ assert string_type == 0
+
+ ch = u[i]
+ # Nowhere inside an expression is a backslash allowed.
+ if ch == u'\\':
+ # Error: can't include a backslash character, inside
+ # parens or strings or not.
+ astbuilder.error("f-string expression part "
+ "cannot include a backslash", atom_node)
+
+ if quote_char:
+ # We're inside a string. See if we're at the end.
+ # (This duplicates string-literal scanning done elsewhere, so
+ # that '}', ':' or '!' inside a string don't end the expression.)
+ if ord(ch) == quote_char:
+ # Does this match the string_type (single or triple
+ # quoted)?
+ if string_type == 3:
+ if i + 2 < len(u) and u[i + 1] == u[i + 2] == ch:
+ # We're at the end of a triple quoted string.
+ i += 3
+ string_type = 0
+ quote_char = 0
+ continue
+ else:
+ # We're at the end of a normal string.
+ i += 1
+ string_type = 0
+ quote_char = 0
+ continue
+ elif ch == u"'" or ch == u'"':
+ # Is this a triple quoted string?
+ if i + 2 < len(u) and u[i + 1] == u[i + 2] == ch:
+ string_type = 3
+ i += 2
+ else:
+ # Start of a normal string.
+ string_type = 1
+ # Start looking for the end of the string.
+ quote_char = ord(ch)
+ elif ch in u"[{(":
nested_depth += 1
- elif nested_depth > 0 and ch in u']})':
+ elif nested_depth != 0 and ch in u"]})":
nested_depth -= 1
- elif nested_depth == 0 and ch in u'!:}':
- # special-case '!='
- if ch == u'!' and p < len(u) and u[p] == u'=':
+ elif ch == u'#':
+ # Error: can't include a comment character, inside parens
+ # or not.
+ astbuilder.error("f-string expression part cannot include '#'",
+ atom_node)
+ elif nested_depth == 0 and ch in u"!:}":
+ # First, test for the special case of "!=". Since '=' is
+ # not an allowed conversion character, nothing is lost in
+ # this test.
+ if ch == '!' and i + 1 < len(u) and u[i+1] == u'=':
+ # This isn't a conversion character, just continue.
+ i += 1
continue
- break # normal way out of this loop
- else:
- ch = u'\x00'
- #
- if nested_depth > 0:
+ # Normal way out of this loop.
+ break
+ #else:
+ # This isn't a conversion character, just continue.
+ i += 1
+
+ # If we leave this loop in a string or with mismatched parens, we
+ # don't care. We'll get a syntax error when compiling the
+ # expression. But, we can produce a better error message, so
+ # let's just do that.
+ if quote_char:
+ astbuilder.error("f-string: unterminated string", atom_node)
+
+ if nested_depth:
astbuilder.error("f-string: mismatched '(', '{' or '['", atom_node)
- end_expression = p - 1
- if ch == u'!':
- if p + 1 < len(u):
- conversion = ord(u[p])
- ch = u[p + 1]
- p += 2
+
+ if i >= len(u):
+ unexpected_end_of_string(astbuilder, atom_node)
+
+ # Compile the expression as soon as possible, so we show errors
+ # related to the expression before errors related to the
+ # conversion or format_spec.
+ expr = f_string_compile(astbuilder, u[expr_start:i], atom_node)
+ assert isinstance(expr, ast.Expression)
+
+ # Check for a conversion char, if present.
+ if u[i] == u'!':
+ i += 1
+ if i >= len(u):
+ unexpected_end_of_string(astbuilder, atom_node)
+
+ conversion = ord(u[i])
+ i += 1
if conversion not in (ord('s'), ord('r'), ord('a')):
astbuilder.error("f-string: invalid conversion character: "
"expected 's', 'r', or 'a'", atom_node)
- if ch == u':':
- if rec >= 2:
- astbuilder.error("f-string: expressions nested too deeply",
- atom_node)
+
+ # Check for the format spec, if present.
+ if i >= len(u):
+ unexpected_end_of_string(astbuilder, atom_node)
+ if u[i] == u':':
+ i += 1
+ if i >= len(u):
+ unexpected_end_of_string(astbuilder, atom_node)
+ fstr.current_index = i
subpieces = []
- p = parse_f_string(astbuilder, subpieces, u, p, atom_node, rec + 1)
+ parse_f_string(astbuilder, subpieces, fstr, atom_node, rec + 1)
format_spec = f_string_to_ast_node(astbuilder, subpieces, atom_node)
- ch = u[p] if p >= 0 else u'\x00'
- p += 1
+ i = fstr.current_index
- if ch != u'}':
- astbuilder.error("f-string: expecting '}'", atom_node)
- end_f_string = p
- assert end_expression >= start
- expr = f_string_compile(astbuilder, u[start:end_expression], atom_node)
- assert isinstance(expr, ast.Expression)
- fval = ast.FormattedValue(expr.body, conversion, format_spec,
+ if i >= len(u) or u[i] != u'}':
+ unexpected_end_of_string(astbuilder, atom_node)
+
+ # We're at a right brace. Consume it.
+ i += 1
+ fstr.current_index = i
+
+ # And now create the FormattedValue node that represents this
+ # entire expression with the conversion and format spec.
+ return ast.FormattedValue(expr.body, conversion, format_spec,
atom_node.get_lineno(),
atom_node.get_column())
- joined_pieces.append(fval)
- return end_f_string
-def parse_f_string(astbuilder, joined_pieces, u, start, atom_node, rec=0):
+
+def fstring_find_literal(astbuilder, fstr, atom_node, rec):
+ # Return the next literal part. Updates the current index inside 'fstr'.
+ # Differs from CPython: this version handles double-braces on its own.
+ u = fstr.unparsed
+ literal_start = fstr.current_index
+ in_named_escape = False
+
+ # Get any literal string. It ends when we hit an un-doubled left
+ # brace (which isn't part of a unicode name escape such as
+ # "\N{EULER CONSTANT}"), or the end of the string.
+ i = literal_start
+ builder = UnicodeBuilder()
+ while i < len(u):
+ ch = u[i]
+ if (not in_named_escape and ch == u'{' and i - literal_start >= 2
+ and u[i - 2] == u'\\' and u[i - 1] == u'N'):
+ in_named_escape = True
+ elif in_named_escape and ch == u'}':
+ in_named_escape = False
+ elif ch == u'{' or ch == u'}':
+ # Check for doubled braces, but only at the top level. If
+ # we checked at every level, then f'{0:{3}}' would fail
+ # with the two closing braces.
+ if rec == 0 and i + 1 < len(u) and u[i + 1] == ch:
+ i += 1 # skip over the second brace
+ elif rec == 0 and ch == u'}':
+ # Where a single '{' is the start of a new expression, a
+ # single '}' is not allowed.
+ astbuilder.error("f-string: single '}' is not allowed",
+ atom_node)
+ else:
+ # We're either at a '{', which means we're starting another
+ # expression; or a '}', which means we're at the end of this
+ # f-string (for a nested format_spec).
+ break
+ builder.append(ch)
+ i += 1
+
+ fstr.current_index = i
+ literal = builder.build()
+ if not fstr.raw_mode:
+ literal = unicodehelper.decode_unicode_escape(astbuilder.space, literal)
+ return literal
+
+
+def fstring_find_literal_and_expr(astbuilder, fstr, atom_node, rec):
+ # Return a tuple with the next literal part, and optionally the
+ # following expression node. Updates the current index inside 'fstr'.
+ literal = fstring_find_literal(astbuilder, fstr, atom_node, rec)
+
+ u = fstr.unparsed
+ i = fstr.current_index
+ if i >= len(u) or u[i] == u'}':
+ # We're at the end of the string or the end of a nested
+ # f-string: no expression.
+ expr = None
+ else:
+ # We must now be at the start of an expression, on a '{'.
+ assert u[i] == u'{'
+ expr = fstring_find_expr(astbuilder, fstr, atom_node, rec)
+ return literal, expr
+
+
+def parse_f_string(astbuilder, joined_pieces, fstr, atom_node, rec=0):
space = astbuilder.space
- p1 = u.find(u'{', start)
- prestart = start
while True:
- if p1 < 0:
- p1 = len(u)
- p2 = u.find(u'}', start, p1)
- if p2 >= 0:
- f_constant_string(astbuilder, joined_pieces, u[prestart:p2],
- atom_node)
- pn = p2 + 1
- if pn < len(u) and u[pn] == u'}': # '}}' => single '}'
- start = pn + 1
- prestart = pn
- continue
- return p2 # found a single '}', stop here
- f_constant_string(astbuilder, joined_pieces, u[prestart:p1], atom_node)
- if p1 == len(u):
- return -1 # no more '{' or '}' left
- pn = p1 + 1
- if pn < len(u) and u[pn] == u'{': # '{{' => single '{'
- start = pn + 1
- prestart = pn
- else:
- assert u[p1] == u'{'
- start = f_string_expr(astbuilder, joined_pieces, u, pn,
- atom_node, rec)
- assert u[start - 1] == u'}'
- prestart = start
- p1 = u.find(u'{', start)
+ literal, expr = fstring_find_literal_and_expr(astbuilder, fstr,
+ atom_node, rec)
+
+ # add the literal part
+ f_constant_string(astbuilder, joined_pieces, literal, atom_node)
+
+ if expr is None:
+ break # We're done with this f-string.
+
+ joined_pieces.append(expr)
+
+ # If rec is zero, then we must be at the end of the
+ # string. Otherwise, we must be at a right brace.
+ if rec == 0 and fstr.current_index < len(fstr.unparsed) - 1:
+ astbuilder.error("f-string: unexpected end of string", atom_node)
+
+ if rec != 0 and (fstr.current_index >= len(fstr.unparsed) or
+ fstr.unparsed[fstr.current_index] != u'}'):
+ astbuilder.error("f-string: expecting '}'", atom_node)
+
def f_string_to_ast_node(astbuilder, joined_pieces, atom_node):
# remove empty Strs
@@ -150,13 +317,14 @@
assert len(joined_pieces) > 0 # they are all empty strings
return joined_pieces[0]
+
def string_parse_literal(astbuilder, atom_node):
space = astbuilder.space
encoding = astbuilder.compile_info.encoding
joined_pieces = []
for i in range(atom_node.num_children()):
try:
- w_next, saw_f = parsestring.parsestr(
+ w_next = parsestring.parsestr(
space, encoding, atom_node.get_child(i).get_value())
except error.OperationError as e:
if not (e.match(space, space.w_UnicodeError) or
@@ -164,15 +332,10 @@
raise
# Unicode/ValueError in literal: turn into SyntaxError
raise astbuilder.error(e.errorstr(space), atom_node)
- if not saw_f:
+ if not isinstance(w_next, parsestring.W_FString):
add_constant_string(astbuilder, joined_pieces, w_next, atom_node)
else:
- p = parse_f_string(astbuilder, joined_pieces,
- space.unicode_w(w_next), 0,
- atom_node)
- if p != -1:
- astbuilder.error("f-string: single '}' is not allowed",
- atom_node)
+ parse_f_string(astbuilder, joined_pieces, w_next, atom_node)
if len(joined_pieces) == 1: # <= the common path
return joined_pieces[0] # ast.Str, Bytes or FormattedValue
# with more than one piece, it is a combination of Str and
diff --git a/pypy/interpreter/astcompiler/test/test_compiler.py b/pypy/interpreter/astcompiler/test/test_compiler.py
--- a/pypy/interpreter/astcompiler/test/test_compiler.py
+++ b/pypy/interpreter/astcompiler/test/test_compiler.py
@@ -1182,6 +1182,8 @@
yield self.st, """x = 42; z = f'{x:5}'""", 'z', ' 42'
yield self.st, """x = 2; z = f'{5:{x:+1}0}'""", 'z', (' ' * 18 + '+5')
+ yield self.st, """z=f'{"}"}'""", 'z', '}'
+
def test_fstring_error(self):
raises(SyntaxError, self.run, "f'{}'")
raises(SyntaxError, self.run, "f'{ \t }'")
diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -1,12 +1,22 @@
# coding: utf-8
+from pypy.interpreter.baseobjspace import W_Root
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter import unicodehelper
from rpython.rlib.rstring import StringBuilder
+class W_FString(W_Root):
+ def __init__(self, unparsed, raw_mode):
+ assert isinstance(unparsed, unicode)
+ self.unparsed = unparsed # but the quotes are removed
+ self.raw_mode = raw_mode
+ self.current_index = 0 # for astcompiler.fstring
+
+
def parsestr(space, encoding, s):
- """Parses a string or unicode literal, and return a pair
- (wrapped value, f_string_flag).
+ """Parses a string or unicode literal, and return usually
+ a wrapped value. If we get an f-string, then instead return
+ an unparsed but unquoted W_FString instance.
If encoding=None, the source string is ascii only.
In other cases, the source string is in utf-8 encoding.
@@ -74,18 +84,17 @@
'unmatched triple quotes in literal')
q -= 2
- if saw_f:
- # forbid any '\' inside '{' and '}' pairs
- pass # XXX DO IT
-
if unicode_literal and not rawmode: # XXX Py_UnicodeFlag is ignored for now
if encoding is None:
assert 0 <= ps <= q
substr = s[ps:q]
else:
substr = decode_unicode_utf8(space, s, ps, q)
+ if saw_f:
+ v = unicodehelper.decode_utf8(space, substr)
+ return W_FString(v, rawmode)
v = unicodehelper.decode_unicode_escape(space, substr)
- return space.wrap(v), saw_f
+ return space.wrap(v)
assert 0 <= ps <= q
substr = s[ps : q]
@@ -99,13 +108,15 @@
if rawmode or '\\' not in substr:
if not unicode_literal:
- return space.newbytes(substr), saw_f
+ return space.newbytes(substr)
else:
v = unicodehelper.decode_utf8(space, substr)
- return space.wrap(v), saw_f
+ if saw_f:
+ return W_FString(v, rawmode)
+ return space.wrap(v)
v = PyString_DecodeEscape(space, substr, 'strict', encoding)
- return space.newbytes(v), saw_f
+ return space.newbytes(v)
def decode_unicode_utf8(space, s, ps, q):
# ****The Python 2.7 version, producing UTF-32 escapes****
diff --git a/pypy/interpreter/pyparser/pytokenizer.py b/pypy/interpreter/pyparser/pytokenizer.py
--- a/pypy/interpreter/pyparser/pytokenizer.py
+++ b/pypy/interpreter/pyparser/pytokenizer.py
@@ -187,9 +187,6 @@
continue
if line[pos] == '#':
# skip full-line comment, but still check that it is valid utf-8
- if flags & consts.PyCF_REFUSE_COMMENTS:
- raise TokenError("comments not allowed here",
- line, lnum, pos, token_list)
if not verify_utf8(line):
raise bad_utf8("comment",
line, lnum, pos, token_list, flags)
@@ -260,9 +257,6 @@
last_comment = ''
elif initial == '#':
# skip comment, but still check that it is valid utf-8
- if flags & consts.PyCF_REFUSE_COMMENTS:
- raise TokenError("comments not allowed here",
- line, lnum, start, token_list)
if not verify_utf8(token):
raise bad_utf8("comment",
line, lnum, start, token_list, flags)
More information about the pypy-commit
mailing list