[Python-checkins] r76250 - in sandbox/trunk/2to3/lib2to3: pgen2/tokenize.py tests/data/bom.py tests/test_parser.py tests/test_refactor.py

Fri Nov 13 23:56:49 CET 2009

Author: benjamin.peterson
Date: Fri Nov 13 23:56:48 2009
New Revision: 76250

Log:
Fix handling of a UTF-8 BOM (issue #7313).

Added:
   sandbox/trunk/2to3/lib2to3/tests/data/bom.py   (contents, props changed)
Modified:
   sandbox/trunk/2to3/lib2to3/pgen2/tokenize.py
   sandbox/trunk/2to3/lib2to3/tests/test_parser.py
   sandbox/trunk/2to3/lib2to3/tests/test_refactor.py

Modified: sandbox/trunk/2to3/lib2to3/pgen2/tokenize.py
==============================================================================

--- sandbox/trunk/2to3/lib2to3/pgen2/tokenize.py	(original)
+++ sandbox/trunk/2to3/lib2to3/pgen2/tokenize.py	Fri Nov 13 23:56:48 2009
@@ -281,9 +281,13 @@
             # This behaviour mimics the Python interpreter
             raise SyntaxError("unknown encoding: " + encoding)
 
-        if bom_found and codec.name != 'utf-8':
-            # This behaviour mimics the Python interpreter
-            raise SyntaxError('encoding problem: utf-8')
+        if bom_found:
+            if codec.name != 'utf-8':
+                # This behaviour mimics the Python interpreter
+                raise SyntaxError('encoding problem: utf-8')
+            else:
+                # Allow it to be properly encoded and decoded.
+                encoding = 'utf-8-sig'
         return encoding
 
     first = read_or_stop()

Added: sandbox/trunk/2to3/lib2to3/tests/data/bom.py
==============================================================================
--- (empty file)
+++ sandbox/trunk/2to3/lib2to3/tests/data/bom.py	Fri Nov 13 23:56:48 2009
@@ -0,0 +1,3 @@
+# coding: utf-8
+print "BOM BOOM!"
+

Modified: sandbox/trunk/2to3/lib2to3/tests/test_parser.py
==============================================================================
--- sandbox/trunk/2to3/lib2to3/tests/test_parser.py	(original)
+++ sandbox/trunk/2to3/lib2to3/tests/test_parser.py	Fri Nov 13 23:56:48 2009
@@ -161,6 +161,7 @@
             tree = driver.parse_string(source)
             new = unicode(tree)
             if diff(filepath, new, encoding):
+                import pdb; pdb.set_trace()
                 self.fail("Idempotency failed: %s" % filepath)
 
     def test_extended_unpacking(self):

Modified: sandbox/trunk/2to3/lib2to3/tests/test_refactor.py
==============================================================================
--- sandbox/trunk/2to3/lib2to3/tests/test_refactor.py	(original)
+++ sandbox/trunk/2to3/lib2to3/tests/test_refactor.py	Fri Nov 13 23:56:48 2009
@@ -4,6 +4,7 @@
 
 import sys
 import os
+import codecs
 import operator
 import StringIO
 import tempfile
@@ -177,10 +178,12 @@
 
         try:
             rt.refactor_file(test_file, True)
-            self.assertNotEqual(old_contents, read_file())
+            new_contents = read_file()
+            self.assertNotEqual(old_contents, new_contents)
         finally:
             with open(test_file, "wb") as fp:
                 fp.write(old_contents)
+        return new_contents
 
     def test_refactor_file(self):
         test_file = os.path.join(FIXER_DIR, "parrot_example.py")
@@ -221,6 +224,11 @@
         fn = os.path.join(TEST_DATA_DIR, "different_encoding.py")
         self.check_file_refactoring(fn)
 
+    def test_bom(self):
+        fn = os.path.join(TEST_DATA_DIR, "bom.py")
+        data = self.check_file_refactoring(fn)
+        self.assertTrue(data.startswith(codecs.BOM_UTF8))
+
     def test_crlf_newlines(self):
         old_sep = os.linesep
         os.linesep = "\r\n"