[pypy-commit] pypy py3k: normalize all AST identifiers to NFKC

pjenvey noreply at buildbot.pypy.org
Tue Mar 12 02:43:14 CET 2013


Author: Philip Jenvey <pjenvey at underboss.org>
Branch: py3k
Changeset: r62305:c05bd31e3342
Date: 2013-03-11 18:39 -0700
http://bitbucket.org/pypy/pypy/changeset/c05bd31e3342/

Log:	normalize all AST identifiers to NFKC

diff --git a/pypy/interpreter/astcompiler/astbuilder.py b/pypy/interpreter/astcompiler/astbuilder.py
--- a/pypy/interpreter/astcompiler/astbuilder.py
+++ b/pypy/interpreter/astcompiler/astbuilder.py
@@ -118,6 +118,9 @@
         except misc.ForbiddenNameAssignment, e:
             self.error("cannot assign to %s" % (e.name,), node)
 
+    def new_identifier(self, name):
+        return misc.new_identifier(self.space, name)
+
     def set_context(self, expr, ctx):
         """Set the context of an expression to Store or Del if possible."""
         try:
@@ -163,9 +166,10 @@
         while True:
             import_name_type = import_name.type
             if import_name_type == syms.import_as_name:
-                name = import_name.children[0].value
+                name = self.new_identifier(import_name.children[0].value)
                 if len(import_name.children) == 3:
-                    as_name = import_name.children[2].value
+                    as_name = self.new_identifier(
+                        import_name.children[2].value)
                     self.check_forbidden_name(as_name, import_name.children[2])
                 else:
                     as_name = None
@@ -178,12 +182,12 @@
                 alias = self.alias_for_import_name(import_name.children[0],
                                                    store=False)
                 asname_node = import_name.children[2]
-                alias.asname = asname_node.value
+                alias.asname = self.new_identifier(asname_node.value)
                 self.check_forbidden_name(alias.asname, asname_node)
                 return alias
             elif import_name_type == syms.dotted_name:
                 if len(import_name.children) == 1:
-                    name = import_name.children[0].value
+                    name = self.new_identifier(import_name.children[0].value)
                     if store:
                         self.check_forbidden_name(name, import_name.children[0])
                     return ast.alias(name, None)
@@ -251,12 +255,12 @@
             raise AssertionError("unknown import node")
 
     def handle_global_stmt(self, global_node):
-        names = [global_node.children[i].value
+        names = [self.new_identifier(global_node.children[i].value)
                  for i in range(1, len(global_node.children), 2)]
         return ast.Global(names, global_node.lineno, global_node.column)
 
     def handle_nonlocal_stmt(self, nonlocal_node):
-        names = [nonlocal_node.children[i].value
+        names = [self.new_identifier(nonlocal_node.children[i].value)
                  for i in range(1, len(nonlocal_node.children), 2)]
         return ast.Nonlocal(names, nonlocal_node.lineno, nonlocal_node.column)
 
@@ -375,7 +379,7 @@
             test = self.handle_expr(exc.children[1])
         if child_count == 4:
             name_node = exc.children[3]
-            name = name_node.value
+            name = self.new_identifier(name_node.value)
             self.check_forbidden_name(name, name_node)
         return ast.ExceptHandler(test, name, suite, exc.lineno, exc.column)
 
@@ -433,7 +437,7 @@
 
     def handle_classdef(self, classdef_node, decorators=None):
         name_node = classdef_node.children[1]
-        name = name_node.value
+        name = self.new_identifier(name_node.value)
         self.check_forbidden_name(name, name_node)
         if len(classdef_node.children) == 4:
             # class NAME ':' suite
@@ -463,7 +467,7 @@
 
     def handle_funcdef(self, funcdef_node, decorators=None):
         name_node = funcdef_node.children[1]
-        name = name_node.value
+        name = self.new_identifier(name_node.value)
         self.check_forbidden_name(name, name_node)
         args = self.handle_arguments(funcdef_node.children[2])
         suite = 4
@@ -503,11 +507,12 @@
         return dec
 
     def handle_dotted_name(self, dotted_name_node):
-        base_value = dotted_name_node.children[0].value
+        base_value = self.new_identifier(dotted_name_node.children[0].value)
         name = ast.Name(base_value, ast.Load, dotted_name_node.lineno,
                         dotted_name_node.column)
         for i in range(2, len(dotted_name_node.children), 2):
             attr = dotted_name_node.children[i].value
+            attr = self.new_identifier(attr)
             name = ast.Attribute(name, attr, ast.Load, dotted_name_node.lineno,
                                  dotted_name_node.column)
         return name
@@ -590,6 +595,7 @@
                                                      kwdefaults)
                 else:
                     vararg = name_node.children[0].value
+                    vararg = self.new_identifier(vararg)
                     self.check_forbidden_name(vararg, name_node)
                     if len(name_node.children) > 1:
                         varargann = self.handle_expr(name_node.children[2])
@@ -603,6 +609,7 @@
             elif arg_type == tokens.DOUBLESTAR:
                 name_node = arguments_node.children[i + 1]
                 kwarg = name_node.children[0].value
+                kwarg = self.new_identifier(kwarg)
                 self.check_forbidden_name(kwarg, name_node)
                 if len(name_node.children) > 1:
                     kwargann = self.handle_expr(name_node.children[2])
@@ -633,6 +640,7 @@
                     ann = self.handle_expr(arg.children[2])
                 name_node = arg.children[0]
                 argname = name_node.value
+                argname = self.new_identifier(argname)
                 self.check_forbidden_name(argname, name_node)
                 kwonly.append(ast.arg(argname, ann))
                 i += 2
@@ -642,11 +650,12 @@
 
     def handle_arg(self, arg_node):
         name_node = arg_node.children[0]
-        self.check_forbidden_name(name_node.value, arg_node)
+        name = self.new_identifier(name_node.value)
+        self.check_forbidden_name(name, arg_node)
         ann = None
         if len(arg_node.children) == 3:
             ann = self.handle_expr(arg_node.children[2])
-        return ast.arg(name_node.value, ann)
+        return ast.arg(name, ann)
 
     def handle_stmt(self, stmt):
         stmt_type = stmt.type
@@ -972,7 +981,7 @@
             else:
                 return self.handle_call(trailer_node.children[1], left_expr)
         elif first_child.type == tokens.DOT:
-            attr = trailer_node.children[1].value
+            attr = self.new_identifier(trailer_node.children[1].value)
             return ast.Attribute(left_expr, attr, ast.Load,
                                  trailer_node.lineno, trailer_node.column)
         else:
@@ -1119,8 +1128,9 @@
         first_child = atom_node.children[0]
         first_child_type = first_child.type
         if first_child_type == tokens.NAME:
-            return ast.Name(first_child.value, ast.Load,
-                            first_child.lineno, first_child.column)
+            name = self.new_identifier(first_child.value)
+            return ast.Name(name, ast.Load, first_child.lineno,
+                            first_child.column)
         elif first_child_type == tokens.STRING:
             space = self.space
             encoding = self.compile_info.encoding
diff --git a/pypy/interpreter/astcompiler/misc.py b/pypy/interpreter/astcompiler/misc.py
--- a/pypy/interpreter/astcompiler/misc.py
+++ b/pypy/interpreter/astcompiler/misc.py
@@ -120,3 +120,18 @@
             klass = klass[:end]
 
     return "_%s%s" % (klass, name)
+
+
+def new_identifier(space, name):
+    # Return `name` unchanged when it is pure ASCII; otherwise decode the
+    # UTF-8 bytes, normalize to NFKC (PEP 3131) and re-encode as UTF-8
+    for c in name:
+        if ord(c) >= 0x80:  # 0x80 is the first non-ASCII byte value
+            break
+    else:
+        return name
+
+    from pypy.module.unicodedata.interp_ucd import ucd
+    w_name = space.wrap(name.decode('utf-8'))
+    w_id = space.call_method(ucd, 'normalize', space.wrap('NFKC'), w_name)
+    return space.unicode_w(w_id).encode('utf-8')
diff --git a/pypy/interpreter/astcompiler/test/test_astbuilder.py b/pypy/interpreter/astcompiler/test/test_astbuilder.py
--- a/pypy/interpreter/astcompiler/test/test_astbuilder.py
+++ b/pypy/interpreter/astcompiler/test/test_astbuilder.py
@@ -1135,10 +1135,29 @@
         assert isinstance(s, ast.Str)
         assert space.eq_w(s.s, space.wrap(japan))
 
-    def test_pep3131(self):
-        assign = self.get_first_stmt("日本 = 32").targets[0]
-        assert isinstance(assign, ast.Name)
-        assert assign.id == u"日本".encode('utf-8')
+    def test_name_pep3131(self):
+        assign = self.get_first_stmt("日本 = 32")
+        assert isinstance(assign, ast.Assign)
+        name = assign.targets[0]
+        assert isinstance(name, ast.Name)
+        assert name.id == u"日本".encode('utf-8')
+
+    def test_function_pep3131(self):
+        fn = self.get_first_stmt("def µ(µ='foo'): pass")
+        assert isinstance(fn, ast.FunctionDef)
+        # µ normalized to NFKC
+        expected = u'\u03bc'.encode('utf-8')
+        assert fn.name == expected
+        assert fn.args.args[0].arg == expected
+
+    def test_import_pep3131(self):
+        im = self.get_first_stmt("from packageµ import modµ as µ")
+        assert isinstance(im, ast.ImportFrom)
+        expected = u'\u03bc'.encode('utf-8')
+        assert im.module == 'package' + expected
+        alias = im.names[0]
+        assert alias.name == 'mod' + expected
+        assert alias.asname == expected
 
     def test_issue3574(self):
         space = self.space
diff --git a/pypy/interpreter/test/test_compiler.py b/pypy/interpreter/test/test_compiler.py
--- a/pypy/interpreter/test/test_compiler.py
+++ b/pypy/interpreter/test/test_compiler.py
@@ -799,6 +799,24 @@
         s = '\udcff'
         raises(UnicodeEncodeError, compile, s, 'foo', 'exec')
 
+    def test_pep3131(self):
+        r"""
+        # XXX: the 4th name is currently mishandled by narrow builds
+        class T:
+            ä = 1
+            µ = 2 # this is a compatibility character
+            蟒 = 3
+            #x󠄀 = 4
+        assert getattr(T, '\xe4') == 1
+        assert getattr(T, '\u03bc') == 2
+        assert getattr(T, '\u87d2') == 3
+        #assert getattr(T, 'x\U000E0100') == 4
+        expected = ("['__dict__', '__doc__', '__module__', '__weakref__', "
+        #            "x󠄀", "'ä', 'μ', '蟒']")
+                    "'ä', 'μ', '蟒']")
+        assert expected in str(sorted(T.__dict__.keys()))
+        """
+
     def test_unicode_identifier(self):
         c = compile("# coding=latin-1\n\u00c6 = '\u00c6'", "dummy", "exec")
         d = {}


More information about the pypy-commit mailing list