[Python-checkins] r72522 - in python/branches/py3k: Lib/lib2to3/fixes/fix_imports.py Lib/lib2to3/fixes/fix_methodattrs.py Lib/lib2to3/fixes/fix_renames.py Lib/lib2to3/fixes/fix_types.py Lib/lib2to3/main.py Lib/lib2to3/patcomp.py Lib/lib2to3/pgen2/driver.py Lib/lib2to3/pgen2/tokenize.py Lib/lib2to3/pytree.py Lib/lib2to3/refactor.py Lib/lib2to3/tests/data/crlf.py Lib/lib2to3/tests/data/different_encoding.py Lib/lib2to3/tests/support.py Lib/lib2to3/tests/test_all_fixers.py Lib/lib2to3/tests/test_parser.py Lib/lib2to3/tests/test_refactor.py

benjamin.peterson python-checkins at python.org
Sat May 9 21:42:24 CEST 2009


Author: benjamin.peterson
Date: Sat May  9 21:42:23 2009
New Revision: 72522

Log:
Merged revisions 72494 via svnmerge from 
svn+ssh://pythondev@svn.python.org/python/trunk

................
  r72494 | benjamin.peterson | 2009-05-08 20:01:14 -0500 (Fri, 08 May 2009) | 21 lines
  
  Merged revisions 72491-72493 via svnmerge from 
  svn+ssh://pythondev@svn.python.org/sandbox/trunk/2to3/lib2to3
  
  ........
    r72491 | benjamin.peterson | 2009-05-08 19:33:27 -0500 (Fri, 08 May 2009) | 7 lines
    
    make 2to3 use unicode internally on 2.x
    
    This started out as a fix for #2660, but became this large refactoring
    when I realized the dire state this was in. 2to3 now uses
    tokenize.detect_encoding to decode the files correctly into unicode.
  ........
    r72492 | benjamin.peterson | 2009-05-08 19:35:38 -0500 (Fri, 08 May 2009) | 1 line
    
    remove compat code
  ........
    r72493 | benjamin.peterson | 2009-05-08 19:54:15 -0500 (Fri, 08 May 2009) | 1 line
    
    add a test for \r\n newlines
  ........
................


Added:
   python/branches/py3k/Lib/lib2to3/tests/data/crlf.py
      - copied unchanged from r72494, /python/trunk/Lib/lib2to3/tests/data/crlf.py
   python/branches/py3k/Lib/lib2to3/tests/data/different_encoding.py
      - copied unchanged from r72494, /python/trunk/Lib/lib2to3/tests/data/different_encoding.py
Modified:
   python/branches/py3k/   (props changed)
   python/branches/py3k/Lib/lib2to3/fixes/fix_imports.py
   python/branches/py3k/Lib/lib2to3/fixes/fix_methodattrs.py
   python/branches/py3k/Lib/lib2to3/fixes/fix_renames.py
   python/branches/py3k/Lib/lib2to3/fixes/fix_types.py
   python/branches/py3k/Lib/lib2to3/main.py
   python/branches/py3k/Lib/lib2to3/patcomp.py
   python/branches/py3k/Lib/lib2to3/pgen2/driver.py
   python/branches/py3k/Lib/lib2to3/pgen2/tokenize.py
   python/branches/py3k/Lib/lib2to3/pytree.py
   python/branches/py3k/Lib/lib2to3/refactor.py
   python/branches/py3k/Lib/lib2to3/tests/support.py
   python/branches/py3k/Lib/lib2to3/tests/test_all_fixers.py
   python/branches/py3k/Lib/lib2to3/tests/test_parser.py
   python/branches/py3k/Lib/lib2to3/tests/test_refactor.py

Modified: python/branches/py3k/Lib/lib2to3/fixes/fix_imports.py
==============================================================================
--- python/branches/py3k/Lib/lib2to3/fixes/fix_imports.py	(original)
+++ python/branches/py3k/Lib/lib2to3/fixes/fix_imports.py	Sat May  9 21:42:23 2009
@@ -123,7 +123,7 @@
         import_mod = results.get("module_name")
         if import_mod:
             mod_name = import_mod.value
-            new_name = self.mapping[mod_name]
+            new_name = str(self.mapping[mod_name])
             import_mod.replace(Name(new_name, prefix=import_mod.get_prefix()))
             if "name_import" in results:
                 # If it's not a "from x import x, y" or "import x as y" import,

Modified: python/branches/py3k/Lib/lib2to3/fixes/fix_methodattrs.py
==============================================================================
--- python/branches/py3k/Lib/lib2to3/fixes/fix_methodattrs.py	(original)
+++ python/branches/py3k/Lib/lib2to3/fixes/fix_methodattrs.py	Sat May  9 21:42:23 2009
@@ -19,5 +19,5 @@
 
     def transform(self, node, results):
         attr = results["attr"][0]
-        new = MAP[attr.value]
+        new = str(MAP[attr.value])
         attr.replace(Name(new, prefix=attr.get_prefix()))

Modified: python/branches/py3k/Lib/lib2to3/fixes/fix_renames.py
==============================================================================
--- python/branches/py3k/Lib/lib2to3/fixes/fix_renames.py	(original)
+++ python/branches/py3k/Lib/lib2to3/fixes/fix_renames.py	Sat May  9 21:42:23 2009
@@ -65,5 +65,5 @@
         #import_mod = results.get("module")
 
         if mod_name and attr_name:
-            new_attr = LOOKUP[(mod_name.value, attr_name.value)]
+            new_attr = str(LOOKUP[(mod_name.value, attr_name.value)])
             attr_name.replace(Name(new_attr, prefix=attr_name.get_prefix()))

Modified: python/branches/py3k/Lib/lib2to3/fixes/fix_types.py
==============================================================================
--- python/branches/py3k/Lib/lib2to3/fixes/fix_types.py	(original)
+++ python/branches/py3k/Lib/lib2to3/fixes/fix_types.py	Sat May  9 21:42:23 2009
@@ -56,7 +56,7 @@
     PATTERN = '|'.join(_pats)
 
     def transform(self, node, results):
-        new_value = _TYPE_MAPPING.get(results["name"].value)
+        new_value = str(_TYPE_MAPPING.get(results["name"].value))
         if new_value:
             return Name(new_value, prefix=node.get_prefix())
         return None

Modified: python/branches/py3k/Lib/lib2to3/main.py
==============================================================================
--- python/branches/py3k/Lib/lib2to3/main.py	(original)
+++ python/branches/py3k/Lib/lib2to3/main.py	Sat May  9 21:42:23 2009
@@ -23,7 +23,7 @@
         self.errors.append((msg, args, kwargs))
         self.logger.error(msg, *args, **kwargs)
 
-    def write_file(self, new_text, filename, old_text):
+    def write_file(self, new_text, filename, old_text, encoding):
         if not self.nobackups:
             # Make backup
             backup = filename + ".bak"
@@ -37,8 +37,8 @@
             except os.error as err:
                 self.log_message("Can't rename %s to %s", filename, backup)
         # Actually write the new file
-        super(StdoutRefactoringTool, self).write_file(new_text,
-                                                      filename, old_text)
+        write = super(StdoutRefactoringTool, self).write_file
+        write(new_text, filename, old_text, encoding)
         if not self.nobackups:
             shutil.copymode(backup, filename)
 

Modified: python/branches/py3k/Lib/lib2to3/patcomp.py
==============================================================================
--- python/branches/py3k/Lib/lib2to3/patcomp.py	(original)
+++ python/branches/py3k/Lib/lib2to3/patcomp.py	Sat May  9 21:42:23 2009
@@ -133,7 +133,7 @@
         assert len(nodes) >= 1
         node = nodes[0]
         if node.type == token.STRING:
-            value = literals.evalString(node.value)
+            value = str(literals.evalString(node.value))
             return pytree.LeafPattern(content=value)
         elif node.type == token.NAME:
             value = node.value

Modified: python/branches/py3k/Lib/lib2to3/pgen2/driver.py
==============================================================================
--- python/branches/py3k/Lib/lib2to3/pgen2/driver.py	(original)
+++ python/branches/py3k/Lib/lib2to3/pgen2/driver.py	Sat May  9 21:42:23 2009
@@ -16,6 +16,7 @@
 __all__ = ["Driver", "load_grammar"]
 
 # Python imports
+import codecs
 import os
 import logging
 import sys
@@ -90,9 +91,9 @@
         """Parse a stream and return the syntax tree."""
         return self.parse_stream_raw(stream, debug)
 
-    def parse_file(self, filename, debug=False):
+    def parse_file(self, filename, encoding=None, debug=False):
         """Parse a file and return the syntax tree."""
-        stream = open(filename)
+        stream = codecs.open(filename, "r", encoding)
         try:
             return self.parse_stream(stream, debug)
         finally:

Modified: python/branches/py3k/Lib/lib2to3/pgen2/tokenize.py
==============================================================================
--- python/branches/py3k/Lib/lib2to3/pgen2/tokenize.py	(original)
+++ python/branches/py3k/Lib/lib2to3/pgen2/tokenize.py	Sat May  9 21:42:23 2009
@@ -30,6 +30,7 @@
     'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
 
 import string, re
+from codecs import BOM_UTF8, lookup
 from lib2to3.pgen2.token import *
 
 from . import token
@@ -228,6 +229,75 @@
                 startline = False
             toks_append(tokval)
 
+cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
+
+def detect_encoding(readline):
+    """
+    The detect_encoding() function is used to detect the encoding that should
+    be used to decode a Python source file. It requires one argument, readline,
+    in the same way as the tokenize() generator.
+
+    It will call readline a maximum of twice, and return the encoding used
+    (as a string) and a list of any lines (left as bytes) it has read
+    in.
+
+    It detects the encoding from the presence of a utf-8 bom or an encoding
+    cookie as specified in pep-0263. If both a bom and a cookie are present,
+    but disagree, a SyntaxError will be raised. If the encoding cookie is an
+    invalid charset, raise a SyntaxError.
+
+    If no encoding is specified, then the default of 'utf-8' will be returned.
+    """
+    bom_found = False
+    encoding = None
+    def read_or_stop():
+        try:
+            return readline()
+        except StopIteration:
+            return b''
+
+    def find_cookie(line):
+        try:
+            line_string = line.decode('ascii')
+        except UnicodeDecodeError:
+            return None
+
+        matches = cookie_re.findall(line_string)
+        if not matches:
+            return None
+        encoding = matches[0]
+        try:
+            codec = lookup(encoding)
+        except LookupError:
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError("unknown encoding: " + encoding)
+
+        if bom_found and codec.name != 'utf-8':
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError('encoding problem: utf-8')
+        return encoding
+
+    first = read_or_stop()
+    if first.startswith(BOM_UTF8):
+        bom_found = True
+        first = first[3:]
+    if not first:
+        return 'utf-8', []
+
+    encoding = find_cookie(first)
+    if encoding:
+        return encoding, [first]
+
+    second = read_or_stop()
+    if not second:
+        return 'utf-8', [first]
+
+    encoding = find_cookie(second)
+    if encoding:
+        return encoding, [first, second]
+
+    return 'utf-8', [first, second]
+
 def untokenize(iterable):
     """Transform tokens back into Python source code.
 

Modified: python/branches/py3k/Lib/lib2to3/pytree.py
==============================================================================
--- python/branches/py3k/Lib/lib2to3/pytree.py	(original)
+++ python/branches/py3k/Lib/lib2to3/pytree.py	Sat May  9 21:42:23 2009
@@ -216,6 +216,10 @@
             return ""
         return next_sib.get_prefix()
 
+    if sys.version_info < (3, 0):
+        def __str__(self):
+            return str(self).encode("ascii")
+
 
 class Node(Base):
 
@@ -245,7 +249,7 @@
                                type_repr(self.type),
                                self.children)
 
-    def __str__(self):
+    def __unicode__(self):
         """
         Return a pretty string representation.
 
@@ -253,6 +257,9 @@
         """
         return "".join(map(str, self.children))
 
+    if sys.version_info > (3, 0):
+        __str__ = __unicode__
+
     def _eq(self, other):
         """Compare two nodes for equality."""
         return (self.type, self.children) == (other.type, other.children)
@@ -353,7 +360,7 @@
                                self.type,
                                self.value)
 
-    def __str__(self):
+    def __unicode__(self):
         """
         Return a pretty string representation.
 
@@ -361,6 +368,9 @@
         """
         return self.prefix + str(self.value)
 
+    if sys.version_info > (3, 0):
+        __str__ = __unicode__
+
     def _eq(self, other):
         """Compare two nodes for equality."""
         return (self.type, self.value) == (other.type, other.value)

Modified: python/branches/py3k/Lib/lib2to3/refactor.py
==============================================================================
--- python/branches/py3k/Lib/lib2to3/refactor.py	(original)
+++ python/branches/py3k/Lib/lib2to3/refactor.py	Sat May  9 21:42:23 2009
@@ -22,8 +22,7 @@
 from itertools import chain
 
 # Local imports
-from .pgen2 import driver
-from .pgen2 import tokenize
+from .pgen2 import driver, tokenize
 
 from . import pytree
 from . import patcomp
@@ -87,6 +86,25 @@
     return [pkg_name + "." + fix_name
             for fix_name in get_all_fix_names(pkg_name, False)]
 
+def _identity(obj):
+    return obj
+
+if sys.version_info < (3, 0):
+    import codecs
+    _open_with_encoding = codecs.open
+    # codecs.open doesn't translate newlines sadly.
+    def _from_system_newlines(input):
+        return input.replace("\r\n", "\n")
+    def _to_system_newlines(input):
+        if os.linesep != "\n":
+            return input.replace("\n", os.linesep)
+        else:
+            return input
+else:
+    _open_with_encoding = open
+    _from_system_newlines = _identity
+    _to_system_newlines = _identity
+
 
 class FixerError(Exception):
     """A fixer could not be loaded."""
@@ -213,29 +231,42 @@
             # Modify dirnames in-place to remove subdirs with leading dots
             dirnames[:] = [dn for dn in dirnames if not dn.startswith(".")]
 
-    def refactor_file(self, filename, write=False, doctests_only=False):
-        """Refactors a file."""
+    def _read_python_source(self, filename):
+        """
+        Do our best to decode a Python source file correctly.
+        """
         try:
-            f = open(filename)
+            f = open(filename, "rb")
         except IOError as err:
             self.log_error("Can't open %s: %s", filename, err)
-            return
+            return None, None
         try:
-            input = f.read() + "\n" # Silence certain parse errors
+            encoding = tokenize.detect_encoding(f.readline)[0]
         finally:
             f.close()
+        with _open_with_encoding(filename, "r", encoding=encoding) as f:
+            return _from_system_newlines(f.read()), encoding
+
+    def refactor_file(self, filename, write=False, doctests_only=False):
+        """Refactors a file."""
+        input, encoding = self._read_python_source(filename)
+        if input is None:
+            # Reading the file failed.
+            return
+        input += "\n" # Silence certain parse errors
         if doctests_only:
             self.log_debug("Refactoring doctests in %s", filename)
             output = self.refactor_docstring(input, filename)
             if output != input:
-                self.processed_file(output, filename, input, write=write)
+                self.processed_file(output, filename, input, write, encoding)
             else:
                 self.log_debug("No doctest changes in %s", filename)
         else:
             tree = self.refactor_string(input, filename)
             if tree and tree.was_changed:
                 # The [:-1] is to take off the \n we added earlier
-                self.processed_file(str(tree)[:-1], filename, write=write)
+                self.processed_file(str(tree)[:-1], filename,
+                                    write=write, encoding=encoding)
             else:
                 self.log_debug("No changes in %s", filename)
 
@@ -321,31 +352,26 @@
                         node.replace(new)
                         node = new
 
-    def processed_file(self, new_text, filename, old_text=None, write=False):
+    def processed_file(self, new_text, filename, old_text=None, write=False,
+                       encoding=None):
         """
         Called when a file has been refactored, and there are changes.
         """
         self.files.append(filename)
         if old_text is None:
-            try:
-                f = open(filename, "r")
-            except IOError as err:
-                self.log_error("Can't read %s: %s", filename, err)
+            old_text = self._read_python_source(filename)[0]
+            if old_text is None:
                 return
-            try:
-                old_text = f.read()
-            finally:
-                f.close()
         if old_text == new_text:
             self.log_debug("No changes to %s", filename)
             return
         self.print_output(diff_texts(old_text, new_text, filename))
         if write:
-            self.write_file(new_text, filename, old_text)
+            self.write_file(new_text, filename, old_text, encoding)
         else:
             self.log_debug("Not writing changes to %s", filename)
 
-    def write_file(self, new_text, filename, old_text):
+    def write_file(self, new_text, filename, old_text, encoding=None):
         """Writes a string to a file.
 
         It first shows a unified diff between the old text and the new text, and
@@ -353,12 +379,12 @@
         set.
         """
         try:
-            f = open(filename, "w")
+            f = _open_with_encoding(filename, "w", encoding=encoding)
         except os.error as err:
             self.log_error("Can't create %s: %s", filename, err)
             return
         try:
-            f.write(new_text)
+            f.write(_to_system_newlines(new_text))
         except os.error as err:
             self.log_error("Can't write %s: %s", filename, err)
         finally:

Modified: python/branches/py3k/Lib/lib2to3/tests/support.py
==============================================================================
--- python/branches/py3k/Lib/lib2to3/tests/support.py	(original)
+++ python/branches/py3k/Lib/lib2to3/tests/support.py	Sat May  9 21:42:23 2009
@@ -9,12 +9,9 @@
 import re
 from textwrap import dedent
 
-#sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
-
 # Local imports
-from .. import pytree
-from .. import refactor
-from ..pgen2 import driver
+from lib2to3 import pytree, refactor
+from lib2to3.pgen2 import driver
 
 test_dir = os.path.dirname(__file__)
 proj_dir = os.path.normpath(os.path.join(test_dir, ".."))
@@ -25,12 +22,6 @@
 def parse_string(string):
     return driver.parse_string(reformat(string), debug=True)
 
-# Python 2.3's TestSuite is not iter()-able
-if sys.version_info < (2, 4):
-    def TestSuite_iter(self):
-        return iter(self._tests)
-    unittest.TestSuite.__iter__ = TestSuite_iter
-
 def run_all_tests(test_mod=None, tests=None):
     if tests is None:
         tests = unittest.TestLoader().loadTestsFromModule(test_mod)

Modified: python/branches/py3k/Lib/lib2to3/tests/test_all_fixers.py
==============================================================================
--- python/branches/py3k/Lib/lib2to3/tests/test_all_fixers.py	(original)
+++ python/branches/py3k/Lib/lib2to3/tests/test_all_fixers.py	Sat May  9 21:42:23 2009
@@ -28,7 +28,7 @@
     def test_all_project_files(self):
         for filepath in support.all_project_files():
             print("Fixing %s..." % filepath)
-            self.refactor.refactor_string(open(filepath).read(), filepath)
+            self.refactor.refactor_file(filepath)
 
 
 if __name__ == "__main__":

Modified: python/branches/py3k/Lib/lib2to3/tests/test_parser.py
==============================================================================
--- python/branches/py3k/Lib/lib2to3/tests/test_parser.py	(original)
+++ python/branches/py3k/Lib/lib2to3/tests/test_parser.py	Sat May  9 21:42:23 2009
@@ -14,9 +14,9 @@
 
 # Python imports
 import os
-import os.path
 
 # Local imports
+from lib2to3.pgen2 import tokenize
 from ..pgen2.parse import ParseError
 
 
@@ -150,13 +150,25 @@
     def test_all_project_files(self):
         for filepath in support.all_project_files():
             print("Parsing %s..." % filepath)
-            tree = driver.parse_file(filepath, debug=True)
-            if diff(filepath, tree):
+            with open(filepath, "rb") as fp:
+                encoding = tokenize.detect_encoding(fp.readline)[0]
+                fp.seek(0)
+                source = fp.read()
+                if encoding:
+                    source = source.decode(encoding)
+            tree = driver.parse_string(source)
+            new = str(tree)
+            if encoding:
+                new = new.encode(encoding)
+            if diff(filepath, new):
                 self.fail("Idempotency failed: %s" % filepath)
 
 
 class TestLiterals(GrammarTest):
 
+    def validate(self, s):
+        driver.parse_string(support.dedent(s) + "\n\n")
+
     def test_multiline_bytes_literals(self):
         s = """
             md5test(b"\xaa" * 80,
@@ -185,10 +197,10 @@
         self.validate(s)
 
 
-def diff(fn, tree):
+def diff(fn, result):
     f = open("@", "w")
     try:
-        f.write(str(tree))
+        f.write(result)
     finally:
         f.close()
     try:

Modified: python/branches/py3k/Lib/lib2to3/tests/test_refactor.py
==============================================================================
--- python/branches/py3k/Lib/lib2to3/tests/test_refactor.py	(original)
+++ python/branches/py3k/Lib/lib2to3/tests/test_refactor.py	Sat May  9 21:42:23 2009
@@ -14,7 +14,8 @@
 from . import support
 
 
-FIXER_DIR = os.path.join(os.path.dirname(__file__), "data/fixers")
+TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
+FIXER_DIR = os.path.join(TEST_DATA_DIR, "fixers")
 
 sys.path.append(FIXER_DIR)
 try:
@@ -22,6 +23,8 @@
 finally:
     sys.path.pop()
 
+_2TO3_FIXERS = refactor.get_fixers_from_package("lib2to3.fixes")
+
 class TestRefactoringTool(unittest.TestCase):
 
     def setUp(self):
@@ -121,19 +124,40 @@
 +def cheese(): pass""".splitlines()
         self.assertEqual(diff_lines[:-1], expected)
 
-    def test_refactor_file(self):
-        test_file = os.path.join(FIXER_DIR, "parrot_example.py")
-        old_contents = open(test_file, "r").read()
-        rt = self.rt()
+    def check_file_refactoring(self, test_file, fixers=_2TO3_FIXERS):
+        def read_file():
+            with open(test_file, "rb") as fp:
+                return fp.read()
+        old_contents = read_file()
+        rt = self.rt(fixers=fixers)
 
         rt.refactor_file(test_file)
-        self.assertEqual(old_contents, open(test_file, "r").read())
+        self.assertEqual(old_contents, read_file())
+
+        try:
+            rt.refactor_file(test_file, True)
+            self.assertNotEqual(old_contents, read_file())
+        finally:
+            with open(test_file, "wb") as fp:
+                fp.write(old_contents)
+
+    def test_refactor_file(self):
+        test_file = os.path.join(FIXER_DIR, "parrot_example.py")
+        self.check_file_refactoring(test_file, _DEFAULT_FIXERS)
 
-        rt.refactor_file(test_file, True)
+    def test_file_encoding(self):
+        fn = os.path.join(TEST_DATA_DIR, "different_encoding.py")
+        self.check_file_refactoring(fn)
+
+    def test_crlf_newlines(self):
+        old_sep = os.linesep
+        os.linesep = "\r\n"
         try:
-            self.assertNotEqual(old_contents, open(test_file, "r").read())
+            fn = os.path.join(TEST_DATA_DIR, "crlf.py")
+            fixes = refactor.get_fixers_from_package("lib2to3.fixes")
+            self.check_file_refactoring(fn, fixes)
         finally:
-            open(test_file, "w").write(old_contents)
+            os.linesep = old_sep
 
     def test_refactor_docstring(self):
         rt = self.rt()


More information about the Python-checkins mailing list