[pypy-commit] pypy py3k: normalize all AST identifiers to NFKC
pjenvey
noreply at buildbot.pypy.org
Tue Mar 12 02:43:14 CET 2013
Author: Philip Jenvey <pjenvey at underboss.org>
Branch: py3k
Changeset: r62305:c05bd31e3342
Date: 2013-03-11 18:39 -0700
http://bitbucket.org/pypy/pypy/changeset/c05bd31e3342/
Log: normalize all AST identifiers to NFKC
diff --git a/pypy/interpreter/astcompiler/astbuilder.py b/pypy/interpreter/astcompiler/astbuilder.py
--- a/pypy/interpreter/astcompiler/astbuilder.py
+++ b/pypy/interpreter/astcompiler/astbuilder.py
@@ -118,6 +118,9 @@
except misc.ForbiddenNameAssignment, e:
self.error("cannot assign to %s" % (e.name,), node)
+ def new_identifier(self, name):
+ return misc.new_identifier(self.space, name)
+
def set_context(self, expr, ctx):
"""Set the context of an expression to Store or Del if possible."""
try:
@@ -163,9 +166,10 @@
while True:
import_name_type = import_name.type
if import_name_type == syms.import_as_name:
- name = import_name.children[0].value
+ name = self.new_identifier(import_name.children[0].value)
if len(import_name.children) == 3:
- as_name = import_name.children[2].value
+ as_name = self.new_identifier(
+ import_name.children[2].value)
self.check_forbidden_name(as_name, import_name.children[2])
else:
as_name = None
@@ -178,12 +182,12 @@
alias = self.alias_for_import_name(import_name.children[0],
store=False)
asname_node = import_name.children[2]
- alias.asname = asname_node.value
+ alias.asname = self.new_identifier(asname_node.value)
self.check_forbidden_name(alias.asname, asname_node)
return alias
elif import_name_type == syms.dotted_name:
if len(import_name.children) == 1:
- name = import_name.children[0].value
+ name = self.new_identifier(import_name.children[0].value)
if store:
self.check_forbidden_name(name, import_name.children[0])
return ast.alias(name, None)
@@ -251,12 +255,12 @@
raise AssertionError("unknown import node")
def handle_global_stmt(self, global_node):
- names = [global_node.children[i].value
+ names = [self.new_identifier(global_node.children[i].value)
for i in range(1, len(global_node.children), 2)]
return ast.Global(names, global_node.lineno, global_node.column)
def handle_nonlocal_stmt(self, nonlocal_node):
- names = [nonlocal_node.children[i].value
+ names = [self.new_identifier(nonlocal_node.children[i].value)
for i in range(1, len(nonlocal_node.children), 2)]
return ast.Nonlocal(names, nonlocal_node.lineno, nonlocal_node.column)
@@ -375,7 +379,7 @@
test = self.handle_expr(exc.children[1])
if child_count == 4:
name_node = exc.children[3]
- name = name_node.value
+ name = self.new_identifier(name_node.value)
self.check_forbidden_name(name, name_node)
return ast.ExceptHandler(test, name, suite, exc.lineno, exc.column)
@@ -433,7 +437,7 @@
def handle_classdef(self, classdef_node, decorators=None):
name_node = classdef_node.children[1]
- name = name_node.value
+ name = self.new_identifier(name_node.value)
self.check_forbidden_name(name, name_node)
if len(classdef_node.children) == 4:
# class NAME ':' suite
@@ -463,7 +467,7 @@
def handle_funcdef(self, funcdef_node, decorators=None):
name_node = funcdef_node.children[1]
- name = name_node.value
+ name = self.new_identifier(name_node.value)
self.check_forbidden_name(name, name_node)
args = self.handle_arguments(funcdef_node.children[2])
suite = 4
@@ -503,11 +507,12 @@
return dec
def handle_dotted_name(self, dotted_name_node):
- base_value = dotted_name_node.children[0].value
+ base_value = self.new_identifier(dotted_name_node.children[0].value)
name = ast.Name(base_value, ast.Load, dotted_name_node.lineno,
dotted_name_node.column)
for i in range(2, len(dotted_name_node.children), 2):
attr = dotted_name_node.children[i].value
+ attr = self.new_identifier(attr)
name = ast.Attribute(name, attr, ast.Load, dotted_name_node.lineno,
dotted_name_node.column)
return name
@@ -590,6 +595,7 @@
kwdefaults)
else:
vararg = name_node.children[0].value
+ vararg = self.new_identifier(vararg)
self.check_forbidden_name(vararg, name_node)
if len(name_node.children) > 1:
varargann = self.handle_expr(name_node.children[2])
@@ -603,6 +609,7 @@
elif arg_type == tokens.DOUBLESTAR:
name_node = arguments_node.children[i + 1]
kwarg = name_node.children[0].value
+ kwarg = self.new_identifier(kwarg)
self.check_forbidden_name(kwarg, name_node)
if len(name_node.children) > 1:
kwargann = self.handle_expr(name_node.children[2])
@@ -633,6 +640,7 @@
ann = self.handle_expr(arg.children[2])
name_node = arg.children[0]
argname = name_node.value
+ argname = self.new_identifier(argname)
self.check_forbidden_name(argname, name_node)
kwonly.append(ast.arg(argname, ann))
i += 2
@@ -642,11 +650,12 @@
def handle_arg(self, arg_node):
name_node = arg_node.children[0]
- self.check_forbidden_name(name_node.value, arg_node)
+ name = self.new_identifier(name_node.value)
+ self.check_forbidden_name(name, arg_node)
ann = None
if len(arg_node.children) == 3:
ann = self.handle_expr(arg_node.children[2])
- return ast.arg(name_node.value, ann)
+ return ast.arg(name, ann)
def handle_stmt(self, stmt):
stmt_type = stmt.type
@@ -972,7 +981,7 @@
else:
return self.handle_call(trailer_node.children[1], left_expr)
elif first_child.type == tokens.DOT:
- attr = trailer_node.children[1].value
+ attr = self.new_identifier(trailer_node.children[1].value)
return ast.Attribute(left_expr, attr, ast.Load,
trailer_node.lineno, trailer_node.column)
else:
@@ -1119,8 +1128,9 @@
first_child = atom_node.children[0]
first_child_type = first_child.type
if first_child_type == tokens.NAME:
- return ast.Name(first_child.value, ast.Load,
- first_child.lineno, first_child.column)
+ name = self.new_identifier(first_child.value)
+ return ast.Name(name, ast.Load, first_child.lineno,
+ first_child.column)
elif first_child_type == tokens.STRING:
space = self.space
encoding = self.compile_info.encoding
diff --git a/pypy/interpreter/astcompiler/misc.py b/pypy/interpreter/astcompiler/misc.py
--- a/pypy/interpreter/astcompiler/misc.py
+++ b/pypy/interpreter/astcompiler/misc.py
@@ -120,3 +120,18 @@
klass = klass[:end]
return "_%s%s" % (klass, name)
+
+
+def new_identifier(space, name):
+ # ASCII-only identifiers are returned unchanged; a name containing any
+ # non-ASCII byte is decoded from UTF-8, normalized to NFKC and re-encoded
+ for c in name:
+ if ord(c) > 0x80:
+ break
+ else:
+ return name
+
+ from pypy.module.unicodedata.interp_ucd import ucd
+ w_name = space.wrap(name.decode('utf-8'))
+ w_id = space.call_method(ucd, 'normalize', space.wrap('NFKC'), w_name)
+ return space.unicode_w(w_id).encode('utf-8')
diff --git a/pypy/interpreter/astcompiler/test/test_astbuilder.py b/pypy/interpreter/astcompiler/test/test_astbuilder.py
--- a/pypy/interpreter/astcompiler/test/test_astbuilder.py
+++ b/pypy/interpreter/astcompiler/test/test_astbuilder.py
@@ -1135,10 +1135,29 @@
assert isinstance(s, ast.Str)
assert space.eq_w(s.s, space.wrap(japan))
- def test_pep3131(self):
- assign = self.get_first_stmt("日本 = 32").targets[0]
- assert isinstance(assign, ast.Name)
- assert assign.id == u"日本".encode('utf-8')
+ def test_name_pep3131(self):
+ assign = self.get_first_stmt("日本 = 32")
+ assert isinstance(assign, ast.Assign)
+ name = assign.targets[0]
+ assert isinstance(name, ast.Name)
+ assert name.id == u"日本".encode('utf-8')
+
+ def test_function_pep3131(self):
+ fn = self.get_first_stmt("def µ(µ='foo'): pass")
+ assert isinstance(fn, ast.FunctionDef)
+ # NFKC maps µ (U+00B5 MICRO SIGN) to μ (U+03BC GREEK SMALL LETTER MU)
+ expected = u'\u03bc'.encode('utf-8')
+ assert fn.name == expected
+ assert fn.args.args[0].arg == expected
+
+ def test_import_pep3131(self):
+ im = self.get_first_stmt("from packageµ import modµ as µ")
+ assert isinstance(im, ast.ImportFrom)
+ expected = u'\u03bc'.encode('utf-8')
+ assert im.module == 'package' + expected
+ alias = im.names[0]
+ assert alias.name == 'mod' + expected
+ assert alias.asname == expected
def test_issue3574(self):
space = self.space
diff --git a/pypy/interpreter/test/test_compiler.py b/pypy/interpreter/test/test_compiler.py
--- a/pypy/interpreter/test/test_compiler.py
+++ b/pypy/interpreter/test/test_compiler.py
@@ -799,6 +799,24 @@
s = '\udcff'
raises(UnicodeEncodeError, compile, s, 'foo', 'exec')
+ def test_pep3131(self):
+ r"""
+ # XXX: the 4th name is currently mishandled by narrow builds
+ class T:
+ ä = 1
+ µ = 2 # this is a compatibility character
+ 蟒 = 3
+ #x󠄀 = 4
+ assert getattr(T, '\xe4') == 1
+ assert getattr(T, '\u03bc') == 2
+ assert getattr(T, '\u87d2') == 3
+ #assert getattr(T, 'x\U000E0100') == 4
+ expected = ("['__dict__', '__doc__', '__module__', '__weakref__', "
+ # "x󠄀", "'ä', 'μ', '蟒']")
+ "'ä', 'μ', '蟒']")
+ assert expected in str(sorted(T.__dict__.keys()))
+ """
+
def test_unicode_identifier(self):
c = compile("# coding=latin-1\n\u00c6 = '\u00c6'", "dummy", "exec")
d = {}
More information about the pypy-commit
mailing list