[Python-checkins] cpython (2.7): lib2to3.pgen2.driver.load_grammar() now creates a stable cache file

gregory.p.smith python-checkins at python.org
Wed Sep 7 21:04:56 EDT 2016


https://hg.python.org/cpython/rev/26397c1ea557
changeset:   103278:26397c1ea557
branch:      2.7
parent:      103236:7aaf8cff23e5
user:        Gregory P. Smith <greg at krypto.org> [Google Inc.]
date:        Thu Sep 08 01:04:37 2016 +0000
summary:
  lib2to3.pgen2.driver.load_grammar() now creates a stable cache file
between runs given the same Grammar.txt input regardless of the hash
randomization setting.

Backport of 186bb8dc5540 from 3.5.  Done in 2.7 per the lib2to3 exemption.

files:
  Lib/lib2to3/pgen2/driver.py      |  17 ++-
  Lib/lib2to3/pgen2/grammar.py     |  31 +++++++-
  Lib/lib2to3/pgen2/pgen.py        |   8 +-
  Lib/lib2to3/tests/support.py     |   6 +-
  Lib/lib2to3/tests/test_parser.py |  72 +++++++++++++++++++-
  Misc/NEWS                        |   4 +
  6 files changed, 118 insertions(+), 20 deletions(-)


diff --git a/Lib/lib2to3/pgen2/driver.py b/Lib/lib2to3/pgen2/driver.py
--- a/Lib/lib2to3/pgen2/driver.py
+++ b/Lib/lib2to3/pgen2/driver.py
@@ -106,16 +106,19 @@
         return self.parse_tokens(tokens, debug)
 
 
+def _generate_pickle_name(gt):
+    head, tail = os.path.splitext(gt)
+    if tail == ".txt":
+        tail = ""
+    return head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
+
+
 def load_grammar(gt="Grammar.txt", gp=None,
                  save=True, force=False, logger=None):
     """Load the grammar (maybe from a pickle)."""
     if logger is None:
         logger = logging.getLogger()
-    if gp is None:
-        head, tail = os.path.splitext(gt)
-        if tail == ".txt":
-            tail = ""
-        gp = head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
+    gp = _generate_pickle_name(gt) if gp is None else gp
     if force or not _newer(gp, gt):
         logger.info("Generating grammar tables from %s", gt)
         g = pgen.generate_grammar(gt)
@@ -123,8 +126,8 @@
             logger.info("Writing grammar tables to %s", gp)
             try:
                 g.dump(gp)
-            except IOError, e:
-                logger.info("Writing failed:"+str(e))
+            except IOError as e:
+                logger.info("Writing failed: %s", e)
     else:
         g = grammar.Grammar()
         g.load(gp)
diff --git a/Lib/lib2to3/pgen2/grammar.py b/Lib/lib2to3/pgen2/grammar.py
--- a/Lib/lib2to3/pgen2/grammar.py
+++ b/Lib/lib2to3/pgen2/grammar.py
@@ -13,6 +13,7 @@
 """
 
 # Python imports
+import collections
 import pickle
 
 # Local imports
@@ -85,10 +86,21 @@
         self.start = 256
 
     def dump(self, filename):
-        """Dump the grammar tables to a pickle file."""
-        f = open(filename, "wb")
-        pickle.dump(self.__dict__, f, 2)
-        f.close()
+        """Dump the grammar tables to a pickle file.
+
+        dump() recursively changes all dict to OrderedDict, so the pickled file
+        is not exactly the same as what was passed in to dump(). load() uses the
+        pickled file to create the tables, but  only changes OrderedDict to dict
+        at the top level; it does not recursively change OrderedDict to dict.
+        So, the loaded tables are different from the original tables that were
+        passed to load() in that some of the OrderedDict (from the pickled file)
+        are not changed back to dict. For parsing, this has no effect on
+        performance because OrderedDict uses dict's __getitem__ with nothing in
+        between.
+        """
+        with open(filename, "wb") as f:
+            d = _make_deterministic(self.__dict__)
+            pickle.dump(d, f, 2)
 
     def load(self, filename):
         """Load the grammar tables from a pickle file."""
@@ -126,6 +138,17 @@
         print "start", self.start
 
 
+def _make_deterministic(top):
+    if isinstance(top, dict):
+        return collections.OrderedDict(
+            sorted(((k, _make_deterministic(v)) for k, v in top.iteritems())))
+    if isinstance(top, list):
+        return [_make_deterministic(e) for e in top]
+    if isinstance(top, tuple):
+        return tuple(_make_deterministic(e) for e in top)
+    return top
+
+
 # Map from operator to number (since tokenize doesn't do this)
 
 opmap_raw = """
diff --git a/Lib/lib2to3/pgen2/pgen.py b/Lib/lib2to3/pgen2/pgen.py
--- a/Lib/lib2to3/pgen2/pgen.py
+++ b/Lib/lib2to3/pgen2/pgen.py
@@ -39,7 +39,7 @@
             states = []
             for state in dfa:
                 arcs = []
-                for label, next in state.arcs.iteritems():
+                for label, next in sorted(state.arcs.iteritems()):
                     arcs.append((self.make_label(c, label), dfa.index(next)))
                 if state.isfinal:
                     arcs.append((0, dfa.index(state)))
@@ -52,7 +52,7 @@
     def make_first(self, c, name):
         rawfirst = self.first[name]
         first = {}
-        for label in rawfirst:
+        for label in sorted(rawfirst):
             ilabel = self.make_label(c, label)
             ##assert ilabel not in first # XXX failed on <> ... !=
             first[ilabel] = 1
@@ -192,7 +192,7 @@
                 for label, next in nfastate.arcs:
                     if label is not None:
                         addclosure(next, arcs.setdefault(label, {}))
-            for label, nfaset in arcs.iteritems():
+            for label, nfaset in sorted(arcs.iteritems()):
                 for st in states:
                     if st.nfaset == nfaset:
                         break
@@ -222,7 +222,7 @@
         print "Dump of DFA for", name
         for i, state in enumerate(dfa):
             print "  State", i, state.isfinal and "(final)" or ""
-            for label, next in state.arcs.iteritems():
+            for label, next in sorted(state.arcs.iteritems()):
                 print "    %s -> %d" % (label, dfa.index(next))
 
     def simplify_dfa(self, dfa):
diff --git a/Lib/lib2to3/tests/support.py b/Lib/lib2to3/tests/support.py
--- a/Lib/lib2to3/tests/support.py
+++ b/Lib/lib2to3/tests/support.py
@@ -11,13 +11,13 @@
 
 # Local imports
 from lib2to3 import pytree, refactor
-from lib2to3.pgen2 import driver
+from lib2to3.pgen2 import driver as pgen2_driver
 
 test_dir = os.path.dirname(__file__)
 proj_dir = os.path.normpath(os.path.join(test_dir, ".."))
 grammar_path = os.path.join(test_dir, "..", "Grammar.txt")
-grammar = driver.load_grammar(grammar_path)
-driver = driver.Driver(grammar, convert=pytree.convert)
+grammar = pgen2_driver.load_grammar(grammar_path)
+driver = pgen2_driver.Driver(grammar, convert=pytree.convert)
 
 def parse_string(string):
     return driver.parse_string(reformat(string), debug=True)
diff --git a/Lib/lib2to3/tests/test_parser.py b/Lib/lib2to3/tests/test_parser.py
--- a/Lib/lib2to3/tests/test_parser.py
+++ b/Lib/lib2to3/tests/test_parser.py
@@ -6,17 +6,20 @@
 test_grammar.py files from both Python 2 and Python 3.
 """
 
-from __future__ import with_statement
-
 # Testing imports
 from . import support
 from .support import driver, test_dir
 
 # Python imports
 import os
+import shutil
+import subprocess
 import sys
+import tempfile
+import unittest
 
 # Local imports
+from lib2to3.pgen2 import driver as pgen2_driver
 from lib2to3.pgen2 import tokenize
 from ..pgen2.parse import ParseError
 from lib2to3.pygram import python_symbols as syms
@@ -31,6 +34,71 @@
         self.assertEqual(t.children[1].children[0].type, syms.print_stmt)
 
 
+class TestPgen2Caching(support.TestCase):
+    def test_load_grammar_from_txt_file(self):
+        pgen2_driver.load_grammar(support.grammar_path, save=False, force=True)
+
+    def test_load_grammar_from_pickle(self):
+        # Make a copy of the grammar file in a temp directory we are
+        # guaranteed to be able to write to.
+        tmpdir = tempfile.mkdtemp()
+        try:
+            grammar_copy = os.path.join(
+                    tmpdir, os.path.basename(support.grammar_path))
+            shutil.copy(support.grammar_path, grammar_copy)
+            pickle_name = pgen2_driver._generate_pickle_name(grammar_copy)
+
+            pgen2_driver.load_grammar(grammar_copy, save=True, force=True)
+            self.assertTrue(os.path.exists(pickle_name))
+
+            os.unlink(grammar_copy)  # Only the pickle remains...
+            pgen2_driver.load_grammar(grammar_copy, save=False, force=False)
+        finally:
+            shutil.rmtree(tmpdir)
+
+    @unittest.skipIf(sys.executable is None, 'sys.executable required')
+    def test_load_grammar_from_subprocess(self):
+        tmpdir = tempfile.mkdtemp()
+        tmpsubdir = os.path.join(tmpdir, 'subdir')
+        try:
+            os.mkdir(tmpsubdir)
+            grammar_base = os.path.basename(support.grammar_path)
+            grammar_copy = os.path.join(tmpdir, grammar_base)
+            grammar_sub_copy = os.path.join(tmpsubdir, grammar_base)
+            shutil.copy(support.grammar_path, grammar_copy)
+            shutil.copy(support.grammar_path, grammar_sub_copy)
+            pickle_name = pgen2_driver._generate_pickle_name(grammar_copy)
+            pickle_sub_name = pgen2_driver._generate_pickle_name(
+                     grammar_sub_copy)
+            self.assertNotEqual(pickle_name, pickle_sub_name)
+
+            # Generate a pickle file from this process.
+            pgen2_driver.load_grammar(grammar_copy, save=True, force=True)
+            self.assertTrue(os.path.exists(pickle_name))
+
+            # Generate a new pickle file in a subprocess with a most likely
+            # different hash randomization seed.
+            sub_env = dict(os.environ)
+            sub_env['PYTHONHASHSEED'] = 'random'
+            subprocess.check_call(
+                    [sys.executable, '-c', """
+from lib2to3.pgen2 import driver as pgen2_driver
+pgen2_driver.load_grammar(%r, save=True, force=True)
+                    """ % (grammar_sub_copy,)],
+                    env=sub_env)
+            self.assertTrue(os.path.exists(pickle_sub_name))
+
+            with open(pickle_name, 'rb') as pickle_f_1, \
+                    open(pickle_sub_name, 'rb') as pickle_f_2:
+                self.assertEqual(
+                    pickle_f_1.read(), pickle_f_2.read(),
+                    msg='Grammar caches generated using different hash seeds'
+                    ' were not identical.')
+        finally:
+            shutil.rmtree(tmpdir)
+
+
+
 class GrammarTest(support.TestCase):
     def validate(self, code):
         support.parse_string(code)
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -38,6 +38,10 @@
 Library
 -------
 
+- lib2to3.pgen2.driver.load_grammar() now creates a stable cache file
+  between runs given the same Grammar.txt input regardless of the hash
+  randomization setting.
+
 - Issue #27691: Fix ssl module's parsing of GEN_RID subject alternative name
   fields in X.509 certs.
 

-- 
Repository URL: https://hg.python.org/cpython


More information about the Python-checkins mailing list